aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2008-07-18 02:39:39 -0700
committerDavid S. Miller <davem@davemloft.net>2008-07-18 02:39:39 -0700
commit49997d75152b3d23c53b0fa730599f2f74c92c65 (patch)
tree46e93126170d02cfec9505172e545732c1b69656 /fs
parenta0c80b80e0fb48129e4e9d6a9ede914f9ff1850d (diff)
parent5b664cb235e97afbf34db9c4d77f08ebd725335e (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: Documentation/powerpc/booting-without-of.txt drivers/atm/Makefile drivers/net/fs_enet/fs_enet-main.c drivers/pci/pci-acpi.c net/8021q/vlan.c net/iucv/iucv.c
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c18
-rw-r--r--fs/Kconfig151
-rw-r--r--fs/Makefile2
-rw-r--r--fs/bio-integrity.c719
-rw-r--r--fs/bio.c88
-rw-r--r--fs/buffer.c21
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/cifsacl.c10
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/inode.c20
-rw-r--r--fs/compat_ioctl.c6
-rw-r--r--fs/configfs/configfs_internal.h4
-rw-r--r--fs/configfs/dir.c147
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/symlink.c16
-rw-r--r--fs/dlm/config.c45
-rw-r--r--fs/dlm/user.c9
-rw-r--r--fs/ecryptfs/file.c3
-rw-r--r--fs/ecryptfs/miscdev.c2
-rw-r--r--fs/exec.c2
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c146
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/file.c6
-rw-r--r--fs/fat/inode.c26
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/gfs2/Kconfig18
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/gfs2.h5
-rw-r--r--fs/gfs2/glock.c1643
-rw-r--r--fs/gfs2/glock.h11
-rw-r--r--fs/gfs2/glops.c70
-rw-r--r--fs/gfs2/incore.h38
-rw-r--r--fs/gfs2/inode.c11
-rw-r--r--fs/gfs2/inode.h2
-rw-r--r--fs/gfs2/locking.c52
-rw-r--r--fs/gfs2/locking/dlm/lock.c368
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h18
-rw-r--r--fs/gfs2/locking/dlm/mount.c14
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/locking/dlm/thread.c331
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c238
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/ops_address.c40
-rw-r--r--fs/gfs2/ops_file.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/ops_inode.c25
-rw-r--r--fs/gfs2/ops_super.c4
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/recovery.c5
-rw-r--r--fs/gfs2/rgrp.c108
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/sys.c16
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/jfs/jfs_debug.c62
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_dtree.h3
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c35
-rw-r--r--fs/jfs/jfs_metapage.c36
-rw-r--r--fs/jfs/jfs_txnmgr.c68
-rw-r--r--fs/jfs/jfs_xtree.c36
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c7
-rw-r--r--fs/libfs.c28
-rw-r--r--fs/lockd/clntproc.c8
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svclock.c7
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c35
-rw-r--r--fs/namespace.c14
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/nfs/callback.c34
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c90
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c161
-rw-r--r--fs/nfs/inode.c79
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/iostat.h119
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3proc.c275
-rw-r--r--fs/nfs/nfs4proc.c265
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfsroot.c10
-rw-r--r--fs/nfs/proc.c28
-rw-r--r--fs/nfs/super.c882
-rw-r--r--fs/nfs/write.c322
-rw-r--r--fs/nfsd/nfs4callback.c2
-rw-r--r--fs/ocfs2/aops.c13
-rw-r--r--fs/ocfs2/cluster/heartbeat.c17
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/cluster/nodemanager.c45
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c136
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/localalloc.c2
-rw-r--r--fs/ocfs2/ocfs2.h12
-rw-r--r--fs/ocfs2/ocfs2_fs.h2
-rw-r--r--fs/ocfs2/stack_user.c22
-rw-r--r--fs/ocfs2/super.c6
-rw-r--r--fs/open.c37
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/proc/proc_misc.c16
-rw-r--r--fs/proc/task_mmu.c86
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/smbfs/file.c11
-rw-r--r--fs/splice.c17
-rw-r--r--fs/ubifs/Kconfig72
-rw-r--r--fs/ubifs/Makefile9
-rw-r--r--fs/ubifs/budget.c731
-rw-r--r--fs/ubifs/commit.c677
-rw-r--r--fs/ubifs/compress.c253
-rw-r--r--fs/ubifs/debug.c2289
-rw-r--r--fs/ubifs/debug.h403
-rw-r--r--fs/ubifs/dir.c1240
-rw-r--r--fs/ubifs/file.c1275
-rw-r--r--fs/ubifs/find.c975
-rw-r--r--fs/ubifs/gc.c773
-rw-r--r--fs/ubifs/io.c914
-rw-r--r--fs/ubifs/ioctl.c204
-rw-r--r--fs/ubifs/journal.c1387
-rw-r--r--fs/ubifs/key.h533
-rw-r--r--fs/ubifs/log.c805
-rw-r--r--fs/ubifs/lprops.c1357
-rw-r--r--fs/ubifs/lpt.c2243
-rw-r--r--fs/ubifs/lpt_commit.c1648
-rw-r--r--fs/ubifs/master.c387
-rw-r--r--fs/ubifs/misc.h342
-rw-r--r--fs/ubifs/orphan.c958
-rw-r--r--fs/ubifs/recovery.c1519
-rw-r--r--fs/ubifs/replay.c1075
-rw-r--r--fs/ubifs/sb.c629
-rw-r--r--fs/ubifs/scan.c362
-rw-r--r--fs/ubifs/shrinker.c322
-rw-r--r--fs/ubifs/super.c1951
-rw-r--r--fs/ubifs/tnc.c2956
-rw-r--r--fs/ubifs/tnc_commit.c1103
-rw-r--r--fs/ubifs/tnc_misc.c494
-rw-r--r--fs/ubifs/ubifs-media.h745
-rw-r--r--fs/ubifs/ubifs.h1649
-rw-r--r--fs/ubifs/xattr.c581
-rw-r--r--fs/vfat/namei.c35
-rw-r--r--fs/xfs/xfs_log.c15
182 files changed, 39659 insertions, 4464 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd01d90cada..57997fa14e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
void v9fs_dentry_release(struct dentry *);
-int v9fs_uflags2omode(int uflags);
+int v9fs_uflags2omode(int uflags, int extended);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 0d55affe37d..52944d2249a 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags);
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
inode->i_size = 0;
inode->i_blocks = 0;
}
+ if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ generic_file_llseek(file, 0, SEEK_END);
}
file->private_data = fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 40fa807bd92..c95295c6504 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
/**
* v9fs_uflags2omode- convert posix open flags to plan 9 mode bits
* @uflags: flags to convert
- *
+ * @extended: if .u extensions are active
*/
-int v9fs_uflags2omode(int uflags)
+int v9fs_uflags2omode(int uflags, int extended)
{
int ret;
@@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags)
break;
}
- if (uflags & O_EXCL)
- ret |= P9_OEXCL;
-
if (uflags & O_TRUNC)
ret |= P9_OTRUNC;
- if (uflags & O_APPEND)
- ret |= P9_OAPPEND;
+ if (extended) {
+ if (uflags & O_EXCL)
+ ret |= P9_OEXCL;
+
+ if (uflags & O_APPEND)
+ ret |= P9_OAPPEND;
+ }
return ret;
}
@@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags));
+ v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
diff --git a/fs/Kconfig b/fs/Kconfig
index cf12c403b8c..37db79a2ff9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -470,6 +470,14 @@ config OCFS2_FS_USERSPACE_CLUSTER
It is safe to say Y, as the clustering method is run-time
selectable.
+config OCFS2_FS_STATS
+ bool "OCFS2 statistics"
+ depends on OCFS2_FS
+ default y
+ help
+ This option allows some fs statistics to be captured. Enabling
+ this option may increase the memory consumption.
+
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
depends on OCFS2_FS
@@ -830,7 +838,7 @@ config NTFS_FS
from the project web site.
For more information see <file:Documentation/filesystems/ntfs.txt>
- and <http://linux-ntfs.sourceforge.net/>.
+ and <http://www.linux-ntfs.org/>.
To compile this file system support as a module, choose M here: the
module will be called ntfs.
@@ -930,7 +938,7 @@ config PROC_KCORE
config PROC_VMCORE
bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+ depends on PROC_FS && CRASH_DUMP
default y
help
Exports the dump image of crashed kernel in ELF format.
@@ -1375,6 +1383,9 @@ config JFFS2_CMODE_FAVOURLZO
endchoice
+# UBIFS File system configuration
+source "fs/ubifs/Kconfig"
+
config CRAMFS
tristate "Compressed ROM file system support (cramfs)"
depends on BLOCK
@@ -1544,10 +1555,6 @@ config UFS_FS
The recently released UFS2 variant (used in FreeBSD 5.x) is
READ-ONLY supported.
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the UFS file system support (but
- you need NFS file system support obviously).
-
Note that this option is generally not needed for floppies, since a
good portable way to transport files and directories between unixes
(and even other operating systems) is given by the tar program ("man
@@ -1587,6 +1594,7 @@ menuconfig NETWORK_FILESYSTEMS
Say Y here to get to see options for network filesystems and
filesystem-related networking code, such as NFS daemon and
RPCSEC security modules.
+
This option alone does not add any kernel code.
If you say N, all options in this submenu will be skipped and
@@ -1595,76 +1603,92 @@ menuconfig NETWORK_FILESYSTEMS
if NETWORK_FILESYSTEMS
config NFS_FS
- tristate "NFS file system support"
+ tristate "NFS client support"
depends on INET
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
- If you are connected to some other (usually local) Unix computer
- (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
- on that computer (the NFS server) using the Network File Sharing
- protocol, say Y. "Mounting files" means that the client can access
- the files with usual UNIX commands as if they were sitting on the
- client's hard disk. For this to work, the server must run the
- programs nfsd and mountd (but does not need to have NFS file system
- support enabled in its kernel). NFS is explained in the Network
- Administrator's Guide, available from
- <http://www.tldp.org/docs.html#guide>, on its man page: "man
- nfs", and in the NFS-HOWTO.
-
- A superior but less widely used alternative to NFS is provided by
- the Coda file system; see "Coda file system support" below.
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
- If you say Y here, you should have said Y to TCP/IP networking also.
- This option would enlarge your kernel by about 27 KB.
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
- To compile this file system support as a module, choose M here: the
- module will be called nfs.
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
- If you are configuring a diskless machine which will mount its root
- file system over NFS at boot time, say Y here and to "Kernel
- level IP autoconfiguration" above and to "Root file system on NFS"
- below. You cannot compile this driver as a module in this case.
- There are two packages designed for booting diskless machines over
- the net: netboot, available from
- <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
- available from <http://ftp1.sourceforge.net/etherboot/>.
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
- If you don't know what all this is about, say N.
+ If unsure, say N.
config NFS_V3
- bool "Provide NFSv3 client support"
+ bool "NFS client support for NFS version 3"
depends on NFS_FS
help
- Say Y here if you want your NFS client to be able to speak version
- 3 of the NFS protocol.
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
If unsure, say Y.
config NFS_V3_ACL
- bool "Provide client support for the NFSv3 ACL protocol extension"
+ bool "NFS client support for the NFSv3 ACL protocol extension"
depends on NFS_V3
help
- Implement the NFSv3 ACL protocol extension for manipulating POSIX
- Access Control Lists. The server should also be compiled with
- the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option.
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but that never became an official part of the
+ NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
If unsure, say N.
config NFS_V4
- bool "Provide NFSv4 client support (EXPERIMENTAL)"
+ bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
depends on NFS_FS && EXPERIMENTAL
select RPCSEC_GSS_KRB5
help
- Say Y here if you want your NFS client to be able to speak the newer
- version 4 of the NFS protocol.
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
- Note: Requires auxiliary userspace daemons which may be found on
- http://www.citi.umich.edu/projects/nfsv4/
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
If unsure, say N.
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/filesystems/nfsroot.txt>.
+
+ Most people say N here.
+
config NFSD
tristate "NFS server support"
depends on INET
@@ -1746,20 +1770,6 @@ config NFSD_V4
If unsure, say N.
-config ROOT_NFS
- bool "Root file system on NFS"
- depends on NFS_FS=y && IP_PNP
- help
- If you want your Linux box to mount its whole root file system (the
- one containing the directory /) from some other computer over the
- net via NFS (presumably because your box doesn't have a hard disk),
- say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
- details. It is likely that in this case, you also want to say Y to
- "Kernel level IP autoconfiguration" so that your box can discover
- its network address at boot time.
-
- Most people say N here.
-
config LOCKD
tristate
@@ -1800,27 +1810,6 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
-config SUNRPC_BIND34
- bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
- depends on SUNRPC && EXPERIMENTAL
- default n
- help
- RPC requests over IPv6 networks require support for larger
- addresses when performing an RPC bind. Sun added support for
- IPv6 addressing by creating two new versions of the rpcbind
- protocol (RFC 1833).
-
- This option enables support in the kernel RPC client for
- querying rpcbind servers via versions 3 and 4 of the rpcbind
- protocol. The kernel automatically falls back to version 2
- if a remote rpcbind service does not support versions 3 or 4.
- By themselves, these new versions do not provide support for
- RPC over IPv6, but the new protocol versions are necessary to
- support it.
-
- If unsure, say N to get traditional behavior (version 2 rpcbind
- requests only).
-
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da..3b2178b4bb6 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
@@ -100,6 +101,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
obj-$(CONFIG_JFFS2_FS) += jffs2/
+obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
obj-$(CONFIG_QNX4FS_FS) += qnx4/
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 00000000000..63e2ee63058
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ * @bs: bio_set to allocate from
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip;
+ struct bio_vec *iv;
+ unsigned long idx;
+
+ BUG_ON(bio == NULL);
+
+ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+ return NULL;
+ }
+
+ memset(bip, 0, sizeof(*bip));
+
+ iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+ if (unlikely(iv == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+ mempool_free(bip, bs->bio_integrity_pool);
+ return NULL;
+ }
+
+ bip->bip_pool = idx;
+ bip->bip_vec = iv;
+ bip->bip_bio = bio;
+ bio->bi_integrity = bip;
+
+ return bip;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
+{
+ return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio: bio containing bip to be freed
+ * @bs: bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip == NULL);
+
+ /* A cloned bio doesn't own the integrity metadata */
+ if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+ kfree(bip->bip_buf);
+
+ mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+ mempool_free(bip, bs->bio_integrity_pool);
+
+ bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio: bio to update
+ * @page: page containing integrity metadata
+ * @len: number of bytes of integrity metadata in page
+ * @offset: start offset within page
+ *
+ * Description: Attach a page containing integrity metadata to bio.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_vec *iv;
+
+ if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+ printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ return 0;
+ }
+
+ iv = bip_vec_idx(bip, bip->bip_vcnt);
+ BUG_ON(iv == NULL);
+ BUG_ON(iv->bv_page != NULL);
+
+ iv->bv_page = page;
+ iv->bv_len = len;
+ iv->bv_offset = offset;
+ bip->bip_vcnt++;
+
+ return len;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio: bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not. bio data direction and target device must be
+ * set prior to calling. The function honors the write_generate and
+ * read_verify flags in sysfs.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return 0;
+
+ return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi: blk_integrity profile for device
+ * @sectors: Number of 512-byte sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device. Convert the block layer sectors
+ * to physical sectors.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ /* At this point there are only 512b or 4096b DIF/EPP devices */
+ if (bi->sector_size == 4096)
+ return sectors >>= 3;
+
+ return sectors;
+}
+
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio: bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+
+ BUG_ON(bio->bi_size == 0);
+
+ return bi->tag_size * (bio->bi_size / bi->sector_size);
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip->bip_buf == NULL);
+
+ if (bi->tag_size == 0)
+ return -1;
+
+ nr_sectors = bio_integrity_hw_sectors(bi,
+ DIV_ROUND_UP(len, bi->tag_size));
+
+ if (nr_sectors * bi->tuple_size > bip->bip_size) {
+ printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+ __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+ return -1;
+ }
+
+ if (set)
+ bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+ else
+ bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+
+ return 0;
+}
+
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio: bio to attach buffer to
+ * @tag_buf: Pointer to a buffer containing tag data
+ * @len: Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection. The
+ * size of the integrity buffer must be <= the size reported by
+ * bio_integrity_tag_size().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != WRITE);
+
+ return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio: bio to retrieve buffer from
+ * @tag_buf: Pointer to a buffer for the tag data
+ * @len: Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= the
+ * size reported by bio_integrity_tag_size().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != READ);
+
+ return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio: bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function. The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_sector;
+ unsigned int i, sectors, total;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ bi->generate_fn(&bix);
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio. The bio must have data
+ * direction, target device and start sector set prior to calling. In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function. In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ struct blk_integrity *bi;
+ struct request_queue *q;
+ void *buf;
+ unsigned long start, end;
+ unsigned int len, nr_pages;
+ unsigned int bytes, offset, i;
+ unsigned int sectors;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ q = bdev_get_queue(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bio_integrity(bio));
+
+ sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+ /* Allocate kernel buffer for protection data */
+ len = sectors * blk_integrity_tuple_size(bi);
+ buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+ if (unlikely(buf == NULL)) {
+ printk(KERN_ERR "could not allocate integrity buffer\n");
+ return -EIO;
+ }
+
+ end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ((unsigned long) buf) >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ /* Allocate bio integrity payload and integrity vectors */
+ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "could not allocate data integrity bioset\n");
+ kfree(buf);
+ return -EIO;
+ }
+
+ bip->bip_buf = buf;
+ bip->bip_size = len;
+ bip->bip_sector = bio->bi_sector;
+
+ /* Map it */
+ offset = offset_in_page(buf);
+ for (i = 0 ; i < nr_pages ; i++) {
+ int ret;
+ bytes = PAGE_SIZE - offset;
+
+ if (len <= 0)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset);
+
+ if (ret == 0)
+ return 0;
+
+ if (ret < bytes)
+ break;
+
+ buf += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+
+ /* Install custom I/O completion handler if read verify is enabled */
+ if (bio_data_dir(bio) == READ) {
+ bip->bip_end_io = bio->bi_end_io;
+ bio->bi_end_io = bio_integrity_endio;
+ }
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE)
+ bio_integrity_generate(bio);
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio: bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio. The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_integrity->bip_sector;
+ unsigned int i, sectors, total, ret;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ ret = total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ ret = bi->verify_fn(&bix);
+
+ if (ret) {
+ kunmap_atomic(kaddr, KM_USER0);
+ break;
+ }
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ return ret;
+}
+
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work: Work struct embedded in the integrity payload of the bio to verify
+ *
+ * Description: Workqueue handler that finishes a READ request. It
+ * verifies the integrity metadata of the bio; when verification fails
+ * the bio loses BIO_UPTODATE and the completion status becomes -EIO.
+ * The bio's original end_io handler is then put back and, if there is
+ * one, invoked with the resulting status.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+	struct bio_integrity_payload *payload;
+	struct bio *bio;
+	int status;
+
+	payload = container_of(work, struct bio_integrity_payload, bip_work);
+	bio = payload->bip_bio;
+
+	/* Start from the status stored in the payload by bio_integrity_endio() */
+	status = payload->bip_error;
+
+	if (bio_integrity_verify(bio) != 0) {
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+		status = -EIO;
+	}
+
+	/* Hand the bio back to its original completion handler */
+	bio->bi_end_io = payload->bip_end_io;
+	if (bio->bi_end_io == NULL)
+		return;
+
+	bio->bi_end_io(bio, status);
+}
+
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio:	Protected bio
+ * @error:	Completion status, saved for the deferred completion
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context. However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context. This function therefore only stores @error in
+ * the integrity payload and queues bio_integrity_verify_fn() on the
+ * kintegrityd workqueue, which finishes the completion.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+	struct bio_integrity_payload *payload = bio->bi_integrity;
+
+	/* The payload must point back at the bio it is attached to */
+	BUG_ON(payload->bip_bio != bio);
+
+	payload->bip_error = error;
+
+	INIT_WORK(&payload->bip_work, bio_integrity_verify_fn);
+	queue_work(kintegrityd_wq, &payload->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip:	Integrity vector to advance
+ * @skip:	Number of bytes to advance it
+ *
+ * Description: Walks the integrity vector, passing over whole entries
+ * while they fit inside @skip. The first entry that is not completely
+ * skipped becomes the new bip_idx; if part of it is skipped, its
+ * bv_offset and bv_len are adjusted to cover only the remainder.
+ * bip_idx is left unchanged if the walk runs out of entries first.
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+			     unsigned int skip)
+{
+	struct bio_vec *vec;
+	unsigned int idx;
+
+	bip_for_each_vec(vec, bip, idx) {
+		/* Entry lies entirely inside the skipped range: pass over it */
+		if (skip != 0 && skip >= vec->bv_len) {
+			skip -= vec->bv_len;
+			continue;
+		}
+
+		/* Partially skipped entry: keep only its tail */
+		if (skip != 0) {
+			vec->bv_offset += skip;
+			vec->bv_len -= skip;
+		}
+
+		bip->bip_idx = idx;
+		return;
+	}
+}
+
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip:	Integrity vector to truncate
+ * @len:	New length of integrity vector
+ *
+ * Description: Walks the integrity vector, charging each entry's
+ * length against @len. The entry in which @len runs out is shortened
+ * to the bytes that are left, and bip_vcnt is set to the index of the
+ * first entry past the new end. bip_vcnt is left unchanged if the walk
+ * ends before such an entry is reached.
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+			     unsigned int len)
+{
+	struct bio_vec *vec;
+	unsigned int idx;
+
+	bip_for_each_vec(vec, bip, idx) {
+		/* Budget used up: everything from this entry on is cut off */
+		if (len == 0) {
+			bip->bip_vcnt = idx;
+			return;
+		}
+
+		if (len < vec->bv_len) {
+			/* Last entry that is kept, shortened to what is left */
+			vec->bv_len = len;
+			len = 0;
+		} else {
+			len -= vec->bv_len;
+		}
+	}
+}
+
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio:	bio whose integrity vector to update
+ * @bytes_done:	number of data bytes that have been completed
+ *
+ * Description: Converts the completed data bytes into a count of
+ * hardware sectors, scales that by the tuple size of the integrity
+ * profile and moves the head of the integrity vector forward by the
+ * resulting number of bytes. Both the integrity payload and the
+ * integrity profile must exist.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	unsigned int hw_sectors;
+
+	BUG_ON(bip == NULL);
+	BUG_ON(bi == NULL);
+
+	/* bytes_done >> 9 turns the byte count into 512-byte units */
+	hw_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
+
+	bio_integrity_mark_head(bip, hw_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio:	bio whose integrity vector to update
+ * @offset:	offset to first data sector
+ * @sectors:	number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'sectors' data
+ * sectors.
+ *
+ * Both counts are converted with bio_integrity_hw_sectors() before
+ * being scaled by the tuple size, the same way bio_integrity_advance()
+ * does it, so the byte counts stay correct when a hardware sector
+ * spans several data sectors.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+			unsigned int sectors)
+{
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+	unsigned int head_bytes, tail_bytes;
+
+	BUG_ON(bip == NULL);
+	BUG_ON(bi == NULL);
+	BUG_ON(!bio_flagged(bio, BIO_CLONED));
+
+	/* Integrity bytes to drop from the front and to keep after that */
+	head_bytes = bio_integrity_hw_sectors(bi, offset) * bi->tuple_size;
+	tail_bytes = bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
+
+	/*
+	 * NOTE(review): bip_sector is still advanced by the raw @offset,
+	 * while bio_integrity_split() advances it in hardware sectors -
+	 * confirm which unit bip_sector is meant to use.
+	 */
+	bip->bip_sector = bip->bip_sector + offset;
+
+	/* Head first: the tail length is counted from the new head */
+	bio_integrity_mark_head(bip, head_bytes);
+	bio_integrity_mark_tail(bip, tail_bytes);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio:	Protected bio
+ * @bp:		Resulting bio_pair
+ * @sectors:	Number of data sectors that go to the first bio of the pair
+ *
+ * Description: Splits an integrity page into a bio_pair. Does nothing
+ * if @bio carries no integrity payload. The payload must consist of a
+ * single integrity vector entry; that entry is copied into both halves
+ * of the pair and then cut at the split point.
+ *
+ * The split point is converted to hardware sectors before being scaled
+ * by the tuple size, so that the byte split agrees with the sector
+ * split when a hardware sector spans several data sectors.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+	struct blk_integrity *bi;
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	unsigned int nr_sectors;
+	unsigned int split_bytes;
+
+	if (bio_integrity(bio) == 0)
+		return;
+
+	bi = bdev_get_integrity(bio->bi_bdev);
+	BUG_ON(bi == NULL);
+	BUG_ON(bip->bip_vcnt != 1);
+
+	nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+	/* Integrity bytes that belong to the first half */
+	split_bytes = nr_sectors * bi->tuple_size;
+
+	bp->bio1.bi_integrity = &bp->bip1;
+	bp->bio2.bi_integrity = &bp->bip2;
+
+	/* Both halves start out as copies of the single original entry */
+	bp->iv1 = bip->bip_vec[0];
+	bp->iv2 = bip->bip_vec[0];
+
+	bp->bip1.bip_vec = &bp->iv1;
+	bp->bip2.bip_vec = &bp->iv2;
+
+	/* First half keeps the leading bytes, second half the rest */
+	bp->iv1.bv_len = split_bytes;
+	bp->iv2.bv_offset += split_bytes;
+	bp->iv2.bv_len -= split_bytes;
+
+	bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+	bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+
+	bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+	bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio:	New bio
+ * @bio_src:	Original bio
+ * @bs:		bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio. The new
+ * payload gets a copy of the source's integrity vector together with
+ * its sector, vector count and index. @bio_src must carry an integrity
+ * payload. Returns 0 on success and -EIO if no payload could be
+ * allocated.
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+			struct bio_set *bs)
+{
+	struct bio_integrity_payload *src = bio_src->bi_integrity;
+	struct bio_integrity_payload *dst;
+
+	BUG_ON(src == NULL);
+
+	dst = bio_integrity_alloc_bioset(bio, GFP_NOIO, src->bip_vcnt, bs);
+	if (!dst)
+		return -EIO;
+
+	memcpy(dst->bip_vec, src->bip_vec,
+	       src->bip_vcnt * sizeof(struct bio_vec));
+
+	dst->bip_idx = src->bip_idx;
+	dst->bip_vcnt = src->bip_vcnt;
+	dst->bip_sector = src->bip_sector;
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+
+/**
+ * bioset_integrity_create - Create the integrity payload pool of a bio_set
+ * @bs:		bio_set to set up
+ * @pool_size:	Size passed to mempool_create_slab_pool()
+ *
+ * Description: Creates a mempool on top of bio_integrity_slab and
+ * stores it in @bs->bio_integrity_pool. Returns 0 on success and -1 if
+ * the pool could not be created.
+ */
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+	bs->bio_integrity_pool =
+		mempool_create_slab_pool(pool_size, bio_integrity_slab);
+
+	return bs->bio_integrity_pool ? 0 : -1;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+/**
+ * bioset_integrity_free - Destroy the integrity payload pool of a bio_set
+ * @bs:	bio_set whose pool to destroy
+ *
+ * Description: Destroys @bs->bio_integrity_pool if one was created;
+ * does nothing otherwise.
+ */
+void bioset_integrity_free(struct bio_set *bs)
+{
+	if (!bs->bio_integrity_pool)
+		return;
+
+	mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+/**
+ * bio_integrity_init_slab - Create the slab cache for integrity payloads
+ *
+ * Description: Sets up bio_integrity_slab, the cache of
+ * struct bio_integrity_payload objects that the per-bio_set integrity
+ * pools are built on.
+ *
+ * This is an __init function and is therefore not exported: init text
+ * is discarded after boot and must not be reachable from modules.
+ */
+void __init bio_integrity_init_slab(void)
+{
+	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+
+/*
+ * Create the kintegrityd workqueue that bio_integrity_endio() queues
+ * its verification work on. Failure to create it is fatal.
+ */
+static int __init integrity_init(void)
+{
+	kintegrityd_wq = create_workqueue("kintegrityd");
+	if (kintegrityd_wq == NULL)
+		panic("Failed to create kintegrityd\n");
+
+	return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb5..88322b066ac 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
-
static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
-
/*
* if you change this list, also change bvec_alloc or things will
* break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
#undef BV
/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
- mempool_t *bio_pool;
- mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-
-/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+ return bvec_slabs[idx].nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
}
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, bio_set);
+
mempool_free(bio, bio_set->bio_pool);
}
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
- if (b) {
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
+ if (!b)
+ return NULL;
+
+ b->bi_destructor = bio_fs_destructor;
+ __bio_clone(b, bio);
+
+ if (bio_integrity(bio)) {
+ int ret = bio_integrity_clone(b, bio, fs_bio_set);
+
+ if (ret < 0) {
+ bio_put(b); /* release the new bio instead of leaking it */
+ return NULL;
+ }
}
return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, bio, prev) < len) {
- prev->bv_len -= len;
- return 0;
+
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+ prev->bv_len -= len;
+ return 0;
+ }
}
goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* queue to get further control
*/
if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
/*
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, bio, bvec) < len) {
+ if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
bp->bio1.bi_private = bi;
bp->bio2.bi_private = pool;
+ if (bio_integrity(bi))
+ bio_integrity_split(bi, bp, first_sectors);
+
return bp;
}
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ bioset_integrity_free(bs);
biovec_free_pools(bs);
kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
if (!bs->bio_pool)
goto bad;
+ if (bioset_integrity_create(bs, bio_pool_size))
+ goto bad;
+
if (!biovec_create_pools(bs, bvec_pool_size))
return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
{
bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_integrity_init_slab();
biovec_init_slabs();
fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c26..d48caee12e2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg)
void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+ on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b8845..3cb7cda3d78 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
spin_lock(&cdev_lock);
+ /* Check i_cdev again in case somebody beat us to it while
+ we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
cdev_put(p);
return -ENXIO;
}
- if (filp->f_op->open) {
- lock_kernel();
+ if (filp->f_op->open)
ret = filp->f_op->open(inode,filp);
- unlock_kernel();
- }
if (ret)
cdev_put(p);
return ret;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff540..0e9fc2ba90e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405a..22857c639df 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(file, offset, origin);
+ return generic_file_llseek_unlocked(file, offset, origin);
}
struct file_system_type cifs_fs_type = {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543cee..2e904bd111c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc) {
- if (rc == -EREMOTE && !is_dfs_referral) {
- is_dfs_referral = true;
- cFYI(DBG2, ("DFS ref"));
- /* for DFS, server does not give us real inode data */
- fill_fake_finddataunix(&find_data, sb);
- rc = 0;
- }
- }
+ if (rc == -EREMOTE && !is_dfs_referral) {
+ is_dfs_referral = true;
+ cFYI(DBG2, ("DFS ref"));
+ /* for DFS, server does not give us real inode data */
+ fill_fake_finddataunix(&find_data, sb);
+ rc = 0;
+ } else if (rc)
+ goto cgiiu_exit;
+
num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
end_of_file = le64_to_cpu(find_data.EndOfFile);
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
*pinode = new_inode(sb);
if (*pinode == NULL) {
rc = -ENOMEM;
- goto cgiiu_exit;
+ goto cgiiu_exit;
}
/* Is an i_ino of zero legal? */
/* note ino incremented to unique num in new_inode */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 05ec7eef869..ddefb8851a9 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -68,9 +68,11 @@
#include <linux/capi.h>
#include <linux/gigaset_dev.h>
+#ifdef CONFIG_BLOCK
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/sg.h>
+#endif
#include <asm/uaccess.h>
#include <linux/ethtool.h>
@@ -1965,6 +1967,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_FONTRESET)
COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
+#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -1974,6 +1977,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+#endif
/* Big T */
COMPATIBLE_IOCTL(TUNSETNOCSUM)
COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2044,6 +2048,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
COMPATIBLE_IOCTL(SIOCSIFVLAN)
COMPATIBLE_IOCTL(SIOCBRADDBR)
COMPATIBLE_IOCTL(SIOCBRDELBR)
+#ifdef CONFIG_BLOCK
/* SG stuff */
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2068,6 +2073,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+#endif
/* PPP stuff */
COMPATIBLE_IOCTL(PPPIOCGFLAGS)
COMPATIBLE_IOCTL(PPPIOCSFLAGS)
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index cca98609aa7..da015c12e3e 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
struct configfs_dirent {
atomic_t s_count;
@@ -47,8 +48,11 @@ struct configfs_dirent {
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
+#define CONFIGFS_USET_IN_MKDIR 0x0200
#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+extern spinlock_t configfs_dirent_lock;
+
extern struct vfsmount * configfs_mount;
extern struct kmem_cache *configfs_dir_cachep;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a48dc7dd876..0e64312a084 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -30,11 +30,25 @@
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/err.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
DECLARE_RWSEM(configfs_rename_sem);
+/*
+ * Protects mutations of configfs_dirent linkage together with proper i_mutex
+ * Also protects mutations of symlinks linkage to target configfs_dirent
+ * Mutators of configfs_dirent linkage must *both* have the proper inode locked
+ * and configfs_dirent_lock locked, in that order.
+ * This allows one to safely traverse configfs_dirent trees and symlinks without
+ * having to lock inodes.
+ *
+ * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
+ * unlocked is not reliable unless in detach_groups() called from
+ * rmdir()/unregister() and from configfs_attach_group()
+ */
+DEFINE_SPINLOCK(configfs_dirent_lock);
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
@@ -74,13 +88,20 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
if (!sd)
- return NULL;
+ return ERR_PTR(-ENOMEM);
atomic_set(&sd->s_count, 1);
INIT_LIST_HEAD(&sd->s_links);
INIT_LIST_HEAD(&sd->s_children);
- list_add(&sd->s_sibling, &parent_sd->s_children);
sd->s_element = element;
+ spin_lock(&configfs_dirent_lock);
+ if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
+ spin_unlock(&configfs_dirent_lock);
+ kmem_cache_free(configfs_dir_cachep, sd);
+ return ERR_PTR(-ENOENT);
+ }
+ list_add(&sd->s_sibling, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
return sd;
}
@@ -118,8 +139,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
struct configfs_dirent * sd;
sd = configfs_new_dirent(parent_sd, element);
- if (!sd)
- return -ENOMEM;
+ if (IS_ERR(sd))
+ return PTR_ERR(sd);
sd->s_mode = mode;
sd->s_type = type;
@@ -173,7 +194,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
} else {
struct configfs_dirent *sd = d->d_fsdata;
if (sd) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
}
@@ -224,7 +247,9 @@ int configfs_create_link(struct configfs_symlink *sl,
else {
struct configfs_dirent *sd = dentry->d_fsdata;
if (sd) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
}
@@ -238,7 +263,9 @@ static void remove_dir(struct dentry * d)
struct configfs_dirent * sd;
sd = d->d_fsdata;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
@@ -331,13 +358,13 @@ static struct dentry * configfs_lookup(struct inode *dir,
/*
* Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
- * attributes and are removed by rmdir(). We recurse, taking i_mutex
- * on all children that are candidates for default detach. If the
- * result is clean, then configfs_detach_group() will handle dropping
- * i_mutex. If there is an error, the caller will clean up the i_mutex
- * holders via configfs_detach_rollback().
+ * attributes and are removed by rmdir(). We recurse, setting
+ * CONFIGFS_USET_DROPPING on all children that are candidates for
+ * default detach.
+ * If there is an error, the caller will reset the flags via
+ * configfs_detach_rollback().
*/
-static int configfs_detach_prep(struct dentry *dentry)
+static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
{
struct configfs_dirent *parent_sd = dentry->d_fsdata;
struct configfs_dirent *sd;
@@ -352,15 +379,20 @@ static int configfs_detach_prep(struct dentry *dentry)
if (sd->s_type & CONFIGFS_NOT_PINNED)
continue;
if (sd->s_type & CONFIGFS_USET_DEFAULT) {
- mutex_lock(&sd->s_dentry->d_inode->i_mutex);
- /* Mark that we've taken i_mutex */
+ /* Abort if racing with mkdir() */
+ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
+ if (wait_mutex)
+ *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
+ return -EAGAIN;
+ }
+ /* Mark that we're trying to drop the group */
sd->s_type |= CONFIGFS_USET_DROPPING;
/*
* Yup, recursive. If there's a problem, blame
* deep nesting of default_groups
*/
- ret = configfs_detach_prep(sd->s_dentry);
+ ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
if (!ret)
continue;
} else
@@ -374,7 +406,7 @@ out:
}
/*
- * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is
+ * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
* set.
*/
static void configfs_detach_rollback(struct dentry *dentry)
@@ -385,11 +417,7 @@ static void configfs_detach_rollback(struct dentry *dentry)
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (sd->s_type & CONFIGFS_USET_DEFAULT) {
configfs_detach_rollback(sd->s_dentry);
-
- if (sd->s_type & CONFIGFS_USET_DROPPING) {
- sd->s_type &= ~CONFIGFS_USET_DROPPING;
- mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
- }
+ sd->s_type &= ~CONFIGFS_USET_DROPPING;
}
}
}
@@ -410,7 +438,9 @@ static void detach_attrs(struct config_item * item)
list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
continue;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dentry);
configfs_put(sd);
}
@@ -466,16 +496,12 @@ static void detach_groups(struct config_group *group)
child = sd->s_dentry;
+ mutex_lock(&child->d_inode->i_mutex);
+
configfs_detach_group(sd->s_element);
child->d_inode->i_flags |= S_DEAD;
- /*
- * From rmdir/unregister, a configfs_detach_prep() pass
- * has taken our i_mutex for us. Drop it.
- * From mkdir/register cleanup, there is no sem held.
- */
- if (sd->s_type & CONFIGFS_USET_DROPPING)
- mutex_unlock(&child->d_inode->i_mutex);
+ mutex_unlock(&child->d_inode->i_mutex);
d_delete(child);
dput(child);
@@ -1047,25 +1073,24 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
group = NULL;
item = NULL;
if (type->ct_group_ops->make_group) {
- group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
- if (group) {
+ ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group);
+ if (!ret) {
link_group(to_config_group(parent_item), group);
item = &group->cg_item;
}
} else {
- item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
- if (item)
+ ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item);
+ if (!ret)
link_obj(parent_item, item);
}
mutex_unlock(&subsys->su_mutex);
kfree(name);
- if (!item) {
+ if (ret) {
/*
- * If item == NULL, then link_obj() was never called.
+ * If ret != 0, then link_obj() was never called.
* There are no extra references to clean up.
*/
- ret = -ENOMEM;
goto out_put;
}
@@ -1093,11 +1118,26 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
*/
module_got = 1;
+ /*
+ * Make racing rmdir() fail if it did not tag parent with
+ * CONFIGFS_USET_DROPPING
+ * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
+ * fail and let rmdir() terminate correctly
+ */
+ spin_lock(&configfs_dirent_lock);
+ /* This will make configfs_detach_prep() fail */
+ sd->s_type |= CONFIGFS_USET_IN_MKDIR;
+ spin_unlock(&configfs_dirent_lock);
+
if (group)
ret = configfs_attach_group(parent_item, item, dentry);
else
ret = configfs_attach_item(parent_item, item, dentry);
+ spin_lock(&configfs_dirent_lock);
+ sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+ spin_unlock(&configfs_dirent_lock);
+
out_unlink:
if (ret) {
/* Tear down everything we built up */
@@ -1161,12 +1201,27 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EINVAL;
}
- ret = configfs_detach_prep(dentry);
- if (ret) {
- configfs_detach_rollback(dentry);
- config_item_put(parent_item);
- return ret;
- }
+ spin_lock(&configfs_dirent_lock);
+ do {
+ struct mutex *wait_mutex;
+
+ ret = configfs_detach_prep(dentry, &wait_mutex);
+ if (ret) {
+ configfs_detach_rollback(dentry);
+ spin_unlock(&configfs_dirent_lock);
+ if (ret != -EAGAIN) {
+ config_item_put(parent_item);
+ return ret;
+ }
+
+ /* Wait until the racing operation terminates */
+ mutex_lock(wait_mutex);
+ mutex_unlock(wait_mutex);
+
+ spin_lock(&configfs_dirent_lock);
+ }
+ } while (ret == -EAGAIN);
+ spin_unlock(&configfs_dirent_lock);
/* Get a working ref for the duration of this function */
item = configfs_get_config_item(dentry);
@@ -1258,7 +1313,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
file->private_data = configfs_new_dirent(parent_sd, NULL);
mutex_unlock(&dentry->d_inode->i_mutex);
- return file->private_data ? 0 : -ENOMEM;
+ return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
}
@@ -1268,7 +1323,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
struct configfs_dirent * cursor = file->private_data;
mutex_lock(&dentry->d_inode->i_mutex);
+ spin_lock(&configfs_dirent_lock);
list_del_init(&cursor->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
mutex_unlock(&dentry->d_inode->i_mutex);
release_configfs_dirent(cursor);
@@ -1308,7 +1365,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
/* fallthrough */
default:
if (filp->f_pos == 2) {
+ spin_lock(&configfs_dirent_lock);
list_move(q, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
}
for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
struct configfs_dirent *next;
@@ -1331,7 +1390,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
dt_type(next)) < 0)
return 0;
+ spin_lock(&configfs_dirent_lock);
list_move(q, p);
+ spin_unlock(&configfs_dirent_lock);
p = q;
filp->f_pos++;
}
@@ -1362,6 +1423,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
struct list_head *p;
loff_t n = file->f_pos - 2;
+ spin_lock(&configfs_dirent_lock);
list_del(&cursor->s_sibling);
p = sd->s_children.next;
while (n && p != &sd->s_children) {
@@ -1373,6 +1435,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
p = p->next;
}
list_add_tail(&cursor->s_sibling, p);
+ spin_unlock(&configfs_dirent_lock);
}
}
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1448,9 +1511,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
I_MUTEX_PARENT);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
- if (configfs_detach_prep(dentry)) {
+ spin_lock(&configfs_dirent_lock);
+ if (configfs_detach_prep(dentry, NULL)) {
printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
}
+ spin_unlock(&configfs_dirent_lock);
configfs_detach_group(&group->cg_item);
dentry->d_inode->i_flags |= S_DEAD;
mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index b9a1d810346..4803ccc9448 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
if (!sd->s_element)
continue;
if (!strcmp(configfs_get_name(sd), name)) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dir);
configfs_put(sd);
break;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 2a731ef5f30..0004d18c40a 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -77,12 +77,15 @@ static int create_link(struct config_item *parent_item,
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
sl->sl_target = config_item_get(item);
- /* FIXME: needs a lock, I'd bet */
+ spin_lock(&configfs_dirent_lock);
list_add(&sl->sl_list, &target_sd->s_links);
+ spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
dentry);
if (ret) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sl->sl_list);
+ spin_unlock(&configfs_dirent_lock);
config_item_put(item);
kfree(sl);
}
@@ -137,8 +140,12 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
goto out_put;
ret = type->ct_item_ops->allow_link(parent_item, target_item);
- if (!ret)
+ if (!ret) {
ret = create_link(parent_item, target_item, dentry);
+ if (ret && type->ct_item_ops->drop_link)
+ type->ct_item_ops->drop_link(parent_item,
+ target_item);
+ }
config_item_put(target_item);
path_put(&nd.path);
@@ -169,7 +176,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dentry->d_parent);
dput(dentry);
configfs_put(sd);
@@ -184,8 +193,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
type->ct_item_ops->drop_link(parent_item,
sl->sl_target);
- /* FIXME: Needs lock */
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sl->sl_list);
+ spin_unlock(&configfs_dirent_lock);
/* Put reference from create_link() */
config_item_put(sl->sl_target);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac23bd288b..492d8caaaf2 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -41,16 +41,20 @@ struct comm;
struct nodes;
struct node;
-static struct config_group *make_cluster(struct config_group *, const char *);
+static int make_cluster(struct config_group *, const char *,
+ struct config_group **);
static void drop_cluster(struct config_group *, struct config_item *);
static void release_cluster(struct config_item *);
-static struct config_group *make_space(struct config_group *, const char *);
+static int make_space(struct config_group *, const char *,
+ struct config_group **);
static void drop_space(struct config_group *, struct config_item *);
static void release_space(struct config_item *);
-static struct config_item *make_comm(struct config_group *, const char *);
+static int make_comm(struct config_group *, const char *,
+ struct config_item **);
static void drop_comm(struct config_group *, struct config_item *);
static void release_comm(struct config_item *);
-static struct config_item *make_node(struct config_group *, const char *);
+static int make_node(struct config_group *, const char *,
+ struct config_item **);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);
@@ -392,8 +396,8 @@ static struct node *to_node(struct config_item *i)
return i ? container_of(i, struct node, item) : NULL;
}
-static struct config_group *make_cluster(struct config_group *g,
- const char *name)
+static int make_cluster(struct config_group *g, const char *name,
+ struct config_group **new_g)
{
struct cluster *cl = NULL;
struct spaces *sps = NULL;
@@ -431,14 +435,15 @@ static struct config_group *make_cluster(struct config_group *g,
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
- return &cl->group;
+ *new_g = &cl->group;
+ return 0;
fail:
kfree(cl);
kfree(gps);
kfree(sps);
kfree(cms);
- return NULL;
+ return -ENOMEM;
}
static void drop_cluster(struct config_group *g, struct config_item *i)
@@ -466,7 +471,8 @@ static void release_cluster(struct config_item *i)
kfree(cl);
}
-static struct config_group *make_space(struct config_group *g, const char *name)
+static int make_space(struct config_group *g, const char *name,
+ struct config_group **new_g)
{
struct space *sp = NULL;
struct nodes *nds = NULL;
@@ -489,13 +495,14 @@ static struct config_group *make_space(struct config_group *g, const char *name)
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
sp->members_count = 0;
- return &sp->group;
+ *new_g = &sp->group;
+ return 0;
fail:
kfree(sp);
kfree(gps);
kfree(nds);
- return NULL;
+ return -ENOMEM;
}
static void drop_space(struct config_group *g, struct config_item *i)
@@ -522,19 +529,21 @@ static void release_space(struct config_item *i)
kfree(sp);
}
-static struct config_item *make_comm(struct config_group *g, const char *name)
+static int make_comm(struct config_group *g, const char *name,
+ struct config_item **new_i)
{
struct comm *cm;
cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
if (!cm)
- return NULL;
+ return -ENOMEM;
config_item_init_type_name(&cm->item, name, &comm_type);
cm->nodeid = -1;
cm->local = 0;
cm->addr_count = 0;
- return &cm->item;
+ *new_i = &cm->item;
+ return 0;
}
static void drop_comm(struct config_group *g, struct config_item *i)
@@ -554,14 +563,15 @@ static void release_comm(struct config_item *i)
kfree(cm);
}
-static struct config_item *make_node(struct config_group *g, const char *name)
+static int make_node(struct config_group *g, const char *name,
+ struct config_item **new_i)
{
struct space *sp = to_space(g->cg_item.ci_parent);
struct node *nd;
nd = kzalloc(sizeof(struct node), GFP_KERNEL);
if (!nd)
- return NULL;
+ return -ENOMEM;
config_item_init_type_name(&nd->item, name, &node_type);
nd->nodeid = -1;
@@ -573,7 +583,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
sp->members_count++;
mutex_unlock(&sp->members_lock);
- return &nd->item;
+ *new_i = &nd->item;
+ return 0;
}
static void drop_node(struct config_group *g, struct config_item *i)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33..f976f303c19 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
#include <linux/poll.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
struct dlm_user_proc *proc;
struct dlm_ls *ls;
+ lock_kernel();
ls = dlm_find_lockspace_device(iminor(inode));
- if (!ls)
+ if (!ls) {
+ unlock_kernel();
return -ENOENT;
+ }
proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
if (!proc) {
dlm_put_lockspace(ls);
+ unlock_kernel();
return -ENOMEM;
}
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
spin_lock_init(&proc->locks_spin);
init_waitqueue_head(&proc->wait);
file->private_data = proc;
+ unlock_kernel();
return 0;
}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
static int ctl_device_open(struct inode *inode, struct file *file)
{
+ cycle_kernel_lock();
file->private_data = NULL;
return 0;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a..24749bf0668 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
#include "ecryptfs_kernel.h"
/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
int rc = 0;
struct file *lower_file = NULL;
+ lock_kernel();
lower_file = ecryptfs_file_to_lower(file);
if (lower_file->f_op && lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ unlock_kernel();
return rc;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 50c994a249a..09a4522f65e 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void)
int rc;
atomic_set(&ecryptfs_num_miscdev_opens, 0);
- mutex_lock(&ecryptfs_daemon_hash_mux);
rc = misc_register(&ecryptfs_miscdev);
if (rc)
printk(KERN_ERR "%s: Failed to register miscellaneous device "
"for communications with userspace daemons; rc = [%d]\n",
__func__, rc);
- mutex_unlock(&ecryptfs_daemon_hash_mux);
return rc;
}
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df..fd9234379e8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -610,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->exec -= stack_shift;
down_write(&mm->mmap_sem);
- vm_flags = vma->vm_flags;
+ vm_flags = VM_STACK_FLAGS;
/*
* Adjust stack execute permissions; explicitly enable for
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fe3119a71ad..2845425077e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d..495ab21b983 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of needed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request
+ * On success, return nblocks
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
/**
* ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group that contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1734,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1882,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success; *count stores the total amount,
+ * and the error is stored in the errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success; *count stores the total amount,
+ * and the error is stored in the errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea19..d3d23d73c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac..303e41cf7b1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fa..6c166c0a54b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d7..ef7409f0e7e 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b2..eb8bc3afe6e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f219..6300226d553 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3da..42c4c0c892e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worst case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worst case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366a..430eb7978db 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8..a45c3737ad3 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7ee..c2c0a8d06d0 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c80..a92eb305344 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphan has i_nlink > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d970774641..8ca2763df09 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * If the caller is the delayed allocation writeout path,
+ * we have already reserved fs blocks for allocation;
+ * let the underlying get_block() function know, to
+ * avoid double accounting.
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Estimate how many mapping (metadata) blocks have to be reserved so that
+ * @blocks data blocks can be allocated to a file that is not extent based.
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	const int per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	/* first level: one block per per_block data blocks, rounded up */
+	const int single = (blocks + per_block - 1) / per_block;
+	/* second level: one block per per_block first-level blocks, rounded up */
+	const int dbl = (single + per_block - 1) / per_block;
+
+	/* the trailing 1 is the third-level block, which is always counted */
+	return single + dbl + 1;
+}
+
+/*
+ * Number of metadata blocks to reserve in order to allocate @blocks data
+ * blocks; picks the extent or the indirect estimator depending on whether
+ * the inode has EXT4_EXTENTS_FL set.
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int uses_extents = (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) != 0;
+
+	return uses_extents ?
+		ext4_ext_calc_metadata_amount(inode, blocks) :
+		ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+/*
+ * ext4_da_reserve_space - reserve room for @nrblocks more delayed data blocks
+ *
+ * Under i_block_reservation_lock, recompute the metadata estimate for the
+ * inode's new total of reserved data blocks and charge @nrblocks plus any
+ * growth of that estimate against sbi->s_freeblocks_counter.
+ *
+ * Returns 0 on success, or -ENOSPC with all counters left untouched when
+ * ext4_has_free_blocks() returns less than the amount asked for.
+ */
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ /* only the growth of the metadata estimate is newly charged */
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+ /* record the new per-inode reservation totals */
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+/*
+ * ext4_da_release_space - shrink @inode's delayed-allocation reservation
+ *
+ * @used: reserved data blocks dropped from the reservation without being
+ * added back to the free-blocks counter (presumably because they
+ * were really allocated - confirm against callers)
+ * @to_free: reserved data blocks handed back to the free-blocks counter
+ *
+ * Under i_block_reservation_lock, recompute the metadata estimate for the
+ * data blocks that stay reserved, return @to_free plus the no longer needed
+ * metadata reservation (less i_allocated_meta_blocks) to
+ * sbi->s_freeblocks_counter, and reset i_allocated_meta_blocks to 0.
+ */
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;
+
+ /* update fs free blocks counter for truncate case */
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+/*
+ * Clear BH_Delay on every buffer of @page that starts at or after byte
+ * @offset and release one reserved data block for each buffer cleared.
+ */
+static void ext4_da_page_release_reservation(struct page *page,
+						unsigned long offset)
+{
+	struct buffer_head *first = page_buffers(page);
+	struct buffer_head *cur = first;
+	unsigned int pos = 0;	/* byte offset of 'cur' inside the page */
+	int nr_delayed = 0;
+
+	do {
+		if (pos >= offset && buffer_delay(cur)) {
+			nr_delayed++;
+			clear_buffer_delay(cur);
+		}
+		pos += cur->b_size;
+		cur = cur->b_this_page;
+	} while (cur != first);
+
+	ext4_da_release_space(page->mapping->host, 0, nr_delayed);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+/*
+ * Context handed to __mpage_da_writepage() through write_cache_pages():
+ * the extent of pages and the extent of blocks collected so far.
+ */
+struct mpage_da_data {
+ struct inode *inode; /* inode whose pages are being written */
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block; /* the filesystem's block mapper function */
+ struct writeback_control *wbc; /* passed on to __mpage_writepage() */
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ * @mpd->wbc: writeback control passed through to __mpage_writepage()
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ *
+ * Returns 0, or the first non-zero value returned by __mpage_writepage();
+ * later pages are still processed after an error.
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* resync to the page actually found; stop past the extent */
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ /* submit the bio __mpage_writepage() left pending, if any */
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ *
+ * Every page in the range must be locked, not under writeback and have
+ * buffers attached; each of these is enforced with BUG_ON().
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ /*
+ * page indexes covering [logical, logical + blocks); cur_logical
+ * starts at the first block of the first of those pages
+ */
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ /*
+ * delayed buffers get their disk block number here;
+ * buffers that are already mapped must agree with it
+ */
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - call unmap_underlying_metadata() on each
+ * block of the run described by @bh (b_blocknr is the first block,
+ * b_size the length in bytes)
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+						struct buffer_head *bh)
+{
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	int nr = bh->b_size >> inode->i_blkbits;
+	int k = 0;
+
+	while (k < nr) {
+		unmap_underlying_metadata(bdev, bh->b_blocknr + k);
+		k++;
+	}
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, thus real
+ * error handling is postponed to __mpage_writepage()
+ *
+ * ->get_block() is called repeatedly, each time for whatever part of the
+ * extent is still left, until the whole extent is covered or a call fails.
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ /* only these three fields of the on-stack bh are set up here */
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - grow the current block extent by one buffer
+ *
+ * @mpd->lbh - the extent collected so far (b_size == 0 means empty)
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block; only its size and BH_FLAGS state bits are read
+ *
+ * A block is merged when it directly follows the extent and has the same
+ * BH_FLAGS state.  Otherwise the current extent is flushed through
+ * mpage_da_map_blocks() and a new one is started at @logical.
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+				   sector_t logical, struct buffer_head *bh)
+{
+	struct buffer_head *ext = &mpd->lbh;
+
+	if (ext->b_size != 0) {
+		sector_t expected = ext->b_blocknr +
+			(ext->b_size >> mpd->inode->i_blkbits);
+
+		/* contiguous and in the same state: just extend */
+		if (logical == expected &&
+		    (bh->b_state & BH_FLAGS) == ext->b_state) {
+			ext->b_size += bh->b_size;
+			return;
+		}
+
+		/* cannot merge: flush what has been collected so far */
+		mpage_da_map_blocks(mpd);
+	}
+
+	/* start a fresh extent (first block ever, or right after a flush) */
+	ext->b_blocknr = logical;
+	ext->b_size = bh->b_size;
+	ext->b_state = bh->b_state & BH_FLAGS;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ *
+ * Used as the per-page callback of write_cache_pages(); always returns 0.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ /* logical number of the first block of this page */
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There are no buffer heads attached yet (mmap?)
+ * we treat the page as full of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - write back the dirty pages of an address space,
+ * allocating not yet allocated blocks, mapping the newly allocated blocks
+ * to the existing bhs and issuing IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function; when NULL the call
+ *             is simply forwarded to generic_writepages()
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * To avoid duplicating the logic that deals with partial pages, multiple
+ * bios per page and so on, non-allocated blocks are found first, allocated
+ * with as few ->get_block() calls as possible, and the pages are then
+ * written through __mpage_writepage().
+ *
+ * __mpage_writepage() must be called only once for each involved page,
+ * otherwise more complicated logic would be needed to deal with pages
+ * w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ *
+ * Returns whatever write_cache_pages() returned.
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc,
+			       get_block_t get_block)
+{
+	struct mpage_da_data ctx;
+	int rc;
+
+	if (get_block == NULL)
+		return generic_writepages(mapping, wbc);
+
+	ctx.inode = mapping->host;
+	ctx.wbc = wbc;
+	ctx.get_block = get_block;
+
+	/* no pages and no blocks collected yet */
+	ctx.first_page = 0;
+	ctx.next_page = 0;
+	ctx.lbh.b_blocknr = 0;
+	ctx.lbh.b_size = 0;
+	ctx.lbh.b_state = 0;
+
+	rc = write_cache_pages(mapping, wbc, __mpage_da_writepage, &ctx);
+
+	/* nothing left over from the walk */
+	if (ctx.first_page == ctx.next_page)
+		return rc;
+
+	/* flush the last extent of pages */
+	mpage_da_map_blocks(&ctx);
+	mpage_da_submit_io(&ctx);
+
+	return rc;
+}
+
+/*
+ * get_block callback used by ->write_begin() only: report the block as
+ * mapped when it already is, otherwise reserve space for it and hand back
+ * a buffer flagged new + delayed.
+ *
+ * Must be called with @create != 0 and a bh_result sized to exactly one
+ * filesystem block.  Returns 0 on success, a negative value from the
+ * lookup, or the error from ext4_da_reserve_space().
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int nr;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/*
+	 * Look the block up without allocating.  Preallocated blocks are
+	 * unmapped but should be treated the same as allocated blocks.
+	 */
+	nr = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+	if (nr < 0)
+		return nr;
+
+	if (nr > 0) {
+		bh_result->b_size = (nr << inode->i_blkbits);
+		return 0;
+	}
+
+	/* nothing found, but space for this buffer is reserved already */
+	if (buffer_delay(bh_result))
+		return 0;
+
+	/*
+	 * The block isn't (pre)allocated yet, let's reserve space.
+	 * XXX: __block_prepare_write() unmaps passed block, is it OK?
+	 */
+	nr = ext4_da_reserve_space(inode, 1);
+	if (nr)
+		return nr;	/* not enough space to reserve */
+
+	map_bh(bh_result, inode->i_sb, 0);
+	set_buffer_new(bh_result);
+	set_buffer_delay(bh_result);
+
+	return 0;
+}
+#define EXT4_DELALLOC_RSVED 1
+/*
+ * get_block callback handed to mpage_da_writepages() by ext4_da_writepages().
+ *
+ * Without a running journal handle only a lookup is done (create == 0) and
+ * finding nothing is a BUG.  With a handle, @create is passed on together
+ * with EXT4_DELALLOC_RSVED.
+ *
+ * When blocks were mapped, bh_result->b_size is set to the mapped length and
+ * i_disksize is raised (never beyond i_size) to cover them; if this call
+ * left i_disksize at its own value the inode is marked dirty and the result
+ * of ext4_mark_inode_dirty() is returned.  Otherwise returns 0, or the
+ * non-positive value from ext4_get_blocks_wrap() unchanged.
+ */
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ }
+
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ if (EXT4_I(inode)->i_disksize == disksize) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * walk_page_buffers() callback: non-zero for a dirty buffer that has no
+ * disk block behind it yet.  @handle is not used.
+ */
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	/* clean buffers never count */
+	if (!buffer_dirty(bh))
+		return 0;
+
+	/*
+	 * unmapped buffer is possible for holes.
+	 * delay buffer is possible with delayed allocation
+	 */
+	return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
+/*
+ * get_block callback for the writepage paths.  @create is ignored: block
+ * allocation is not wanted in writepage, so ext4_get_blocks_wrap() is
+ * always called with create = 0.
+ *
+ * Returns 0 after setting bh_result->b_size to the mapped length when
+ * blocks were found; otherwise returns the non-positive result unchanged.
+ */
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+				       struct buffer_head *bh_result, int create)
+{
+	unsigned nr_max = bh_result->b_size >> inode->i_blkbits;
+	int got;
+
+	got = ext4_get_blocks_wrap(NULL, inode, iblock, nr_max,
+				   bh_result, 0, 0, 0);
+	if (got <= 0)
+		return got;
+
+	bh_result->b_size = (got << inode->i_blkbits);
+	return 0;
+}
+
+/*
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ *
+ * No block allocation is done here: if any dirty buffer within i_size is
+ * unmapped or delayed, the page is redirtied and unlocked and 0 is
+ * returned. Otherwise the page is written with nobh_writepage() or
+ * block_write_full_page() and that function's result is returned.
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ /* len: how many bytes of this page lie within i_size */
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+ * so just redirty the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits need for
+ * extent based files, currently the DIO credits is based on
+ * indirect-blocks mapping way.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+/*
+ * ->writepages() for delayed allocation.
+ *
+ * Writes back in rounds of at most EXT4_MAX_WRITEBACK_PAGES pages, each
+ * round under its own journal handle with EXT4_MAX_WRITEBACK_CREDITS
+ * credits, until wbc->nr_to_write is used up, a round comes back with
+ * part of its budget unused, or an error occurs.
+ *
+ * On return wbc->nr_to_write holds what is left of the original budget.
+ * Returns 0 or the first error met.
+ */
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worse case needed credits to write out
+ * EXT4_MAX_WRITEBACK_PAGES pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the caller's range_start
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+ * set the max dirty pages could be write at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ /* to_write now counts what remains after this round's budget */
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ /* note: a saved range_start of 0 is not written back */
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
+
+/*
+ * ext4_da_write_begin - ->write_begin() for delayed allocation
+ *
+ * Starts a one-credit journal handle, grabs the page cache page covering
+ * @pos and prepares it through block_write_begin() with
+ * ext4_da_get_block_prep(), which reserves space instead of allocating.
+ *
+ * On success the handle is left running and the locked page is returned in
+ * *@pagep.  On every failure path the handle is stopped and the page, if one
+ * was obtained, is unlocked and released.  -ENOSPC is retried for as long
+ * as ext4_should_retry_alloc() allows.
+ *
+ * Returns the result of block_write_begin(), the error from
+ * ext4_journal_start(), or -ENOMEM when no page could be obtained.
+ */
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	int ret, retries = 0;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+retry:
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journal the i_disksize update if the write goes to the end
+	 * of a file which has an already mapped buffer.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		/* the handle started above must not be leaked */
+		ext4_journal_stop(handle);
+		ret = -ENOMEM;
+		goto out;
+	}
+	*pagep = page;
+
+	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				ext4_da_get_block_prep);
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		page_cache_release(page);
+	}
+
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+out:
+	return ret;
+}
+
+/*
+ * Decide whether i_disksize should be updated for a write to the end of
+ * the file that does not require block allocation.
+ *
+ * Looks at the buffer of @page containing byte @offset and returns 1 when
+ * that buffer is mapped and not delayed, 0 otherwise.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+					    unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh = page_buffers(page);
+	unsigned int steps = offset >> inode->i_blkbits;
+
+	/* advance to the buffer that holds 'offset' */
+	while (steps--)
+		bh = bh->b_this_page;
+
+	return buffer_mapped(bh) && !buffer_delay(bh);
+}
+
+/*
+ * ext4_da_write_end - ->write_end() for delayed allocation
+ *
+ * If the write extends past i_disksize and the last byte written lands in
+ * a buffer that is already mapped and not delayed, i_disksize is updated
+ * here (filing the inode with the journal first in ordered mode).  The
+ * rest is left to generic_write_end(), after which the handle obtained
+ * from ext4_journal_current_handle() is stopped.
+ *
+ * Returns the number of bytes accepted by generic_write_end(), or the
+ * first error seen.
+ */
+static int ext4_da_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	handle_t *handle = ext4_journal_current_handle();
+	loff_t new_i_size;
+	unsigned long start, end;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	/* offset of the last byte written; only meaningful when copied != 0 */
+	end = start + copied - 1;
+
+	/*
+	 * generic_write_end() will run mark_inode_dirty() if i_size
+	 * changes. So let's piggyback the i_disksize mark_inode_dirty
+	 * into that.
+	 */
+
+	new_i_size = pos + copied;
+	/*
+	 * Nothing copied means nothing to extend.  Without this check a
+	 * zero-length copy at a page-aligned pos would wrap 'end' around
+	 * and make the helper below look up a bogus buffer index.
+	 */
+	if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+		if (ext4_da_should_update_i_disksize(page, end)) {
+			down_write(&EXT4_I(inode)->i_data_sem);
+			if (new_i_size > EXT4_I(inode)->i_disksize) {
+				/*
+				 * Updating i_disksize when extending file
+				 * without needing block allocation
+				 */
+				if (ext4_should_order_data(inode))
+					ret = ext4_jbd2_file_inode(handle,
+								   inode);
+
+				EXT4_I(inode)->i_disksize = new_i_size;
+			}
+			up_write(&EXT4_I(inode)->i_data_sem);
+		}
+	}
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
+				 page, fsdata);
+	copied = ret2;
+	if (ret2 < 0)
+		ret = ret2;
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	return ret ? ret : copied;
+}
+
+/*
+ * ->invalidatepage() for delayed allocation: drop the reservations held by
+ * the page's delayed buffers from @offset onwards, then hand the page to
+ * ext4_invalidatepage().  The page must be locked.
+ */
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* only pages with buffers can hold reserved blocks */
+	if (page_has_buffers(page))
+		ext4_da_page_release_reservation(page, offset);
+
+	ext4_invalidatepage(page, offset);
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But so as not to confuse the user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation. We are not holding
+ * inode.i_mutex here. That allows parallel write_begin,
+ * write_end calls. lock_page prevents this from happening
+ * on the same page though.
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade15..8d141a25bbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simple won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * calculate needed size. if change bb_counters size,
+ * don't forget about ext4_mb_generate_buddy()
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc data (buddy and bitmap
+ * data) are set not up to date so that they will be re-initialized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
+{ \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
{ \
- struct ext4_sb_info *sbi = data; \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * free blocks account has already been reduced/reserved
+ * at write_begin() time for delayed allocation,
+ * so do not account for these blocks twice
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3743,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4085,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4309,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830..387ad98350c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c0423..f000fbe2cd9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cb96f127c36..1cb371dcd60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
unsigned long def_mount_opts;
struct super_block *sb = vfs->mnt_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- journal_t *journal = sbi->s_journal;
struct ext4_super_block *es = sbi->s_es;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -894,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -953,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1309,15 +1324,39 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+ /*
+ * When e2fsprogs supports resizing an already existing
+ * ext3 file system to more than 2**32 blocks, we need to
+ * add support to the block allocator for growing
+ * already existing block-mapped inodes so that blocks
+ * allocated for them fall within 2**32
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1331,6 +1370,9 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2372,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
@@ -3337,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398..93c5fdcdad2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cad..ac1a52cf2a3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4..d91aa61b42a 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af2..3a9ecac8d61 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
- return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
}
static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99a..34541d06e62 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
loff_t cpos;
int ret = 0;
- lock_kernel();
+ lock_super(sb);
cpos = filp->f_pos;
/* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
if (unicode)
__putname(unicode);
out:
- unlock_kernel();
+ unlock_super(sb);
return ret;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047..c672df4036e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
- lock_kernel();
fat_free(inode, nr_clusters);
- unlock_kernel();
fat_flush_inodes(inode->i_sb, inode, NULL);
}
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
int error = 0;
unsigned int ia_valid;
- lock_kernel();
-
/*
* Expand the file. Since inode_setattr() updates ->i_size
* before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
error = inode_setattr(inode, attr);
out:
- unlock_kernel();
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d67..46a4508ffd2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
static void fat_clear_inode(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
- lock_kernel();
spin_lock(&sbi->inode_hash_lock);
fat_cache_inval_inode(inode);
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
- unlock_kernel();
}
static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
return 0;
- lock_kernel();
+ lock_super(sb);
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
if (i_pos != MSDOS_I(inode)->i_pos) {
spin_unlock(&sbi->inode_hash_lock);
brelse(bh);
- unlock_kernel();
+ unlock_super(sb);
goto retry;
}
@@ -606,7 +605,7 @@ retry:
err = sync_dirty_buffer(bh);
brelse(bh);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
static struct dentry *fat_get_parent(struct dentry *child)
{
+ struct super_block *sb = child->d_sb;
struct buffer_head *bh;
struct msdos_dir_entry *de;
loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
struct inode *inode;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
if (err) {
parent = ERR_PTR(err);
goto out;
}
- inode = fat_build_inode(child->d_sb, de, i_pos);
+ inode = fat_build_inode(sb, de, i_pos);
brelse(bh);
if (IS_ERR(inode)) {
parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
parent = ERR_PTR(-ENOMEM);
}
out:
- unlock_kernel();
+ unlock_super(sb);
return parent;
}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
long error;
char buf[50];
+ /*
+ * GFP_KERNEL is ok here, because while we do hold the
+ * superblock lock, memory pressure can't call back into
+ * the filesystem, since we're only just about to mount
+ * it and have no inodes etc active!
+ */
sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a7..330a7d78259 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (error)
return error;
- lock_kernel();
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
out:
- unlock_kernel();
return error;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae45f77765c..25adfc3c693 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
* that it can be located for waiting on in __writeback_single_inode().
*
- * Called under inode_lock.
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
@@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* on the writer throttling path, and we get decent balancing between many
* throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void generic_sync_sb_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */
+ spin_lock(&inode_lock);
if (!wbc->for_kupdate || list_empty(&sb->s_io))
queue_io(sb, wbc->older_than_this);
@@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (!list_empty(&sb->s_more_io))
wbc->more_io = 1;
}
+ spin_unlock(&inode_lock);
return; /* Leave any unwritten inodes on s_io */
}
+EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
+
+static void sync_sb_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
+{
+ generic_sync_sb_inodes(sb, wbc);
+}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
@@ -565,11 +572,8 @@ restart:
* be unmounted by the time it is released.
*/
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_lock(&inode_lock);
+ if (sb->s_root)
sync_sb_inodes(sb, wbc);
- spin_unlock(&inode_lock);
- }
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
@@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
nr_dirty + nr_unstable;
wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
- spin_lock(&inode_lock);
sync_sb_inodes(sb, &wbc);
- spin_unlock(&inode_lock);
}
/*
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfb..ab2f57e3fb8 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
GFS is perfect consistency -- changes made to the filesystem on one
machine show up immediately on all other machines in the cluster.
- To use the GFS2 filesystem, you will need to enable one or more of
- the below locking modules. Documentation and utilities for GFS2 can
+ To use the GFS2 filesystem in a cluster, you will need to enable
+ the locking module below. Documentation and utilities for GFS2 can
be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
- tristate "GFS2 \"nolock\" locking module"
- depends on GFS2_FS
- help
- Single node locking module for GFS2.
-
- Use this module if you want to use GFS2 on a single node without
- its clustering features. You can still take advantage of the
- large file support, and upgrade to running a full cluster later on
- if required.
-
- If you will only be using GFS2 in cluster mode, you do not need this
- module.
+ The "nolock" lock module is now built in to GFS2 by default.
config GFS2_FS_LOCKING_DLM
tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a0..ec65851ec80 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b5..ef606e3a5cf 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
};
enum {
- NO_WAIT = 0,
- WAIT = 1,
-};
-
-enum {
NO_FORCE = 0,
FORCE = 1,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5..13391e54661 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
struct hlist_head hb_list;
};
-struct glock_iter {
- int hash; /* hash bucket index */
- struct gfs2_sbd *sdp; /* incore superblock */
- struct gfs2_glock *gl; /* current glock struct */
- struct seq_file *seq; /* sequence file for debugfs */
- char string[512]; /* scratch space */
+struct gfs2_glock_iter {
+ int hash; /* hash bucket index */
+ struct gfs2_sbd *sdp; /* incore superblock */
+ struct gfs2_glock *gl; /* current glock struct */
+ char string[512]; /* scratch space */
};
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
-static void run_queue(struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
#endif
/**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
- int flags)
-{
- if (actual == requested)
- return 1;
-
- if (flags & GL_EXACT)
- return 0;
-
- if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
- return 1;
-
- if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
- return 1;
-
- return 0;
-}
-
-/**
* gl_hash() - Turn glock number into hash bucket number
* @lock: The glock number
*
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct inode *aspace = gl->gl_aspace;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
int gfs2_glock_put(struct gfs2_glock *gl)
{
int rv = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
write_lock(gl_lock_addr(gl->gl_hash));
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
- gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
- gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
- gfs2_assert(sdp, list_empty(&gl->gl_holders));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+ GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
return gl;
}
+/**
+ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: 1 if it's ok to grant the lock, 0 otherwise
+ */
+
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+ const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+ gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+ return 0;
+ if (gl->gl_state == gh->gh_state)
+ return 1;
+ if (gh->gh_flags & GL_EXACT)
+ return 0;
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+ if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+ return 1;
+ if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+ return 1;
+ }
+ if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+ return 1;
+ return 0;
+}
+
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+ clear_bit(HIF_WAIT, &gh->gh_iflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+ * Returns: 1 if there is a blocked holder at the head of the list, 0 otherwise
+ */
+
+static int do_promote(struct gfs2_glock *gl)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh, *tmp;
+ int ret;
+
+restart:
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (may_grant(gl, gh)) {
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_spin);
+ /* FIXME: eliminate this eventually */
+ ret = glops->go_lock(gh);
+ spin_lock(&gl->gl_spin);
+ if (ret) {
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ continue;
+ }
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+/**
+ * do_error - Something unexpected has happened during a lock request
+ *
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+ else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+ gh->gh_error = GLR_TRYFAILED;
+ else
+ continue;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ }
+}
+
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+ int held1, held2;
+
+ held1 = (gl->gl_state != LM_ST_UNLOCKED);
+ held2 = (new_state != LM_ST_UNLOCKED);
+
+ if (held1 != held2) {
+ if (held2)
+ gfs2_glock_hold(gl);
+ else
+ gfs2_glock_put(gl);
+ }
+
+ gl->gl_state = new_state;
+ gl->gl_tchange = jiffies;
+}
+
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+ gl->gl_demote_state = LM_ST_EXCLUSIVE;
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh;
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ spin_lock(&gl->gl_spin);
+ state_change(gl, state);
+ gh = find_first_waiter(gl);
+
+ /* Demote to UN request arrived during demote to SH or DF */
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_target = LM_ST_UNLOCKED;
+
+ /* Check for state != intended state */
+ if (unlikely(state != gl->gl_target)) {
+ if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ /* move to back of queue and try next entry */
+ if (ret & LM_OUT_CANCELED) {
+ if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ goto retry;
+ }
+ /* Some error or failed "try lock" - report it */
+ if ((ret & LM_OUT_ERROR) ||
+ (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+ gl->gl_target = gl->gl_state;
+ do_error(gl, ret);
+ goto out;
+ }
+ }
+ switch(state) {
+ /* Unlocked due to conversion deadlock, try again */
+ case LM_ST_UNLOCKED:
+retry:
+ do_xmote(gl, gh, gl->gl_target);
+ break;
+ /* Conversion fails, unlock and try again */
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ do_xmote(gl, gh, LM_ST_UNLOCKED);
+ break;
+ default: /* Everything else */
+ printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+ GLOCK_BUG_ON(gl, 1);
+ }
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ /* Fast path - we got what we asked for */
+ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (state != LM_ST_UNLOCKED) {
+ if (glops->go_xmote_bh) {
+ int rv;
+ spin_unlock(&gl->gl_spin);
+ rv = glops->go_xmote_bh(gl, gh);
+ if (rv == -EAGAIN)
+ return;
+ spin_lock(&gl->gl_spin);
+ if (rv) {
+ do_error(gl, rv);
+ goto out;
+ }
+ }
+ do_promote(gl);
+ }
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+}
+
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = LM_OUT_ERROR;
+
+ if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+ return req_state == LM_ST_UNLOCKED ? 0 : req_state;
+
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The glock
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ int ret;
+
+ lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+ LM_FLAG_PRIORITY);
+ BUG_ON(gl->gl_state == target);
+ BUG_ON(gl->gl_state == gl->gl_target);
+ if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+ glops->go_inval) {
+ set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ spin_unlock(&gl->gl_spin);
+ if (glops->go_xmote_th)
+ glops->go_xmote_th(gl);
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+
+ gfs2_glock_hold(gl);
+ if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+ gl->gl_state == LM_ST_DEFERRED) &&
+ !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ lck_flags |= LM_FLAG_TRY_1CB;
+ ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+
+ if (!(ret & LM_OUT_ASYNC)) {
+ finish_xmote(gl, ret);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ } else {
+ GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+ }
+ spin_lock(&gl->gl_spin);
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+ struct gfs2_holder *gh = NULL;
+
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+ return;
+
+ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+ gl->gl_demote_state != gl->gl_state) {
+ if (find_first_holder(gl))
+ goto out;
+ if (nonblock)
+ goto out_sched;
+ set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+ gl->gl_target = gl->gl_demote_state;
+ } else {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (do_promote(gl) == 0)
+ goto out;
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
+out_sched:
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+}
+
static void glock_work_func(struct work_struct *work)
{
+ unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ finish_xmote(gl, gl->gl_reply);
spin_lock(&gl->gl_spin);
- if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- run_queue(gl);
+ if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+ unsigned long holdtime, now = jiffies;
+ holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+ }
+ run_queue(gl, 0);
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
+ if (!delay ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+ gfs2_glock_put(gl);
}
static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
void **lockp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_name = name;
atomic_set(&gl->gl_ref, 1);
gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_hash = hash;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
gl->gl_ops = glops;
- gl->gl_req_gh = NULL;
gl->gl_stamp = jiffies;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
gh->gh_ip = 0;
}
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
-
static int just_schedule(void *word)
{
schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
}
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
- gl->gl_demote_state = LM_ST_EXCLUSIVE;
- clear_bit(GLF_DEMOTE, &gl->gl_flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
-
static void wait_on_demote(struct gfs2_glock *gl)
{
might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
}
/**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_mutex(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- list_del_init(&gh->gh_list);
- /* gh->gh_error never examined. */
- set_bit(GLF_LOCK, &gl->gl_flags);
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-
- return 1;
-}
-
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_promote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- if (list_empty(&gl->gl_holders)) {
- gl->gl_req_gh = gh;
- set_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gh->gh_gl, gh);
- spin_lock(&gl->gl_spin);
- }
- return 1;
- }
-
- if (list_empty(&gl->gl_holders)) {
- set_bit(HIF_FIRST, &gh->gh_iflags);
- set_bit(GLF_LOCK, &gl->gl_flags);
- } else {
- struct gfs2_holder *next_gh;
- if (gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
- gh_list);
- if (next_gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- }
-
- list_move_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
-
- gfs2_holder_wake(gh);
-
- return 0;
-}
-
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_demote(struct gfs2_glock *gl)
-{
- if (!list_empty(&gl->gl_holders))
- return 1;
-
- if (gl->gl_state == gl->gl_demote_state ||
- gl->gl_state == LM_ST_UNLOCKED) {
- gfs2_demote_wake(gl);
- return 0;
- }
-
- set_bit(GLF_LOCK, &gl->gl_flags);
- set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- if (gl->gl_demote_state == LM_ST_UNLOCKED ||
- gl->gl_state != LM_ST_EXCLUSIVE) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- } else {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, NULL);
- }
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- return 0;
-}
-
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
- int blocked = 1;
-
- for (;;) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- break;
-
- if (!list_empty(&gl->gl_waiters1)) {
- gh = list_entry(gl->gl_waiters1.next,
- struct gfs2_holder, gh_list);
- blocked = rq_mutex(gh);
- } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- blocked = rq_demote(gl);
- if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
- !blocked) {
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- clear_bit(GLF_WAITERS2, &gl->gl_flags);
- } else if (!list_empty(&gl->gl_waiters3)) {
- gh = list_entry(gl->gl_waiters3.next,
- struct gfs2_holder, gh_list);
- blocked = rq_promote(gh);
- } else
- break;
-
- if (blocked)
- break;
- }
-}
-
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder gh;
-
- gfs2_holder_init(gl, 0, 0, &gh);
- set_bit(HIF_WAIT, &gh.gh_iflags);
- list_add_tail(&gh.gh_list, &gl->gl_waiters1);
- spin_unlock(&gl->gl_spin);
- wait_on_holder(&gh);
- gfs2_holder_uninit(&gh);
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- spin_unlock(&gl->gl_spin);
- }
-}
-
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
- int acquired = 1;
-
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- acquired = 0;
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- }
- spin_unlock(&gl->gl_spin);
-
- return acquired;
-}
-
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
- struct pid *pid;
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- pid = gl->gl_owner_pid;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
-
- put_pid(pid);
-}
-
-/**
* handle_callback - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
- spin_lock(&gl->gl_spin);
set_bit(bit, &gl->gl_flags);
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
- gl->gl_object) {
+ gl->gl_object)
gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
- return;
- }
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
- if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
- set_bit(GLF_WAITERS2, &gl->gl_flags);
- else
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
- int held1, held2;
-
- held1 = (gl->gl_state != LM_ST_UNLOCKED);
- held2 = (new_state != LM_ST_UNLOCKED);
-
- if (held1 != held2) {
- if (held2)
- gfs2_glock_hold(gl);
- else
- gfs2_glock_put(gl);
+ gl->gl_demote_state = LM_ST_UNLOCKED;
}
-
- gl->gl_state = new_state;
- gl->gl_tchange = jiffies;
}
/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_holder *gh = gl->gl_req_gh;
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !ret);
-
- state_change(gl, LM_ST_UNLOCKED);
-
- if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
- spin_lock(&gl->gl_spin);
- gh->gh_error = 0;
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, gl->gl_req_gh);
- gfs2_glock_put(gl);
- return;
- }
-
- spin_lock(&gl->gl_spin);
- gfs2_demote_wake(gl);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
-}
-
-/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh = gl->gl_req_gh;
- int op_done = 1;
-
- if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
- drop_bh(gl, ret);
- return;
- }
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-
- state_change(gl, ret & LM_OUT_ST_MASK);
-
- /* Deal with each possible exit condition */
-
- if (!gh) {
- gl->gl_stamp = jiffies;
- if (ret & LM_OUT_CANCELED) {
- op_done = 0;
- } else {
- spin_lock(&gl->gl_spin);
- if (gl->gl_state != gl->gl_demote_state) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- gfs2_demote_wake(gl);
- spin_unlock(&gl->gl_spin);
- }
- } else {
- spin_lock(&gl->gl_spin);
- if (ret & LM_OUT_CONV_DEADLK) {
- gh->gh_error = 0;
- set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- list_del_init(&gh->gh_list);
- gh->gh_error = -EIO;
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- goto out;
- gh->gh_error = GLR_CANCELED;
- if (ret & LM_OUT_CANCELED)
- goto out;
- if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- set_bit(HIF_FIRST, &gh->gh_iflags);
- op_done = 0;
- goto out;
- }
- gh->gh_error = GLR_TRYFAILED;
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
- goto out;
- gh->gh_error = -EINVAL;
- if (gfs2_assert_withdraw(sdp, 0) == -1)
- fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
- spin_unlock(&gl->gl_spin);
- }
-
- if (glops->go_xmote_bh)
- glops->go_xmote_bh(gl);
-
- if (op_done) {
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- }
-
- gfs2_glock_put(gl);
-
- if (gh)
- gfs2_holder_wake(gh);
-}
-
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
- req_state, flags);
- return ret;
-}
-
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- int flags = gh ? gh->gh_flags : 0;
- unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
- LM_FLAG_NOEXP | LM_FLAG_ANY |
- LM_FLAG_PRIORITY);
- unsigned int lck_ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (state == LM_ST_DEFERRED && glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
- gfs2_assert_warn(sdp, state != gl->gl_state);
-
- gfs2_glock_hold(gl);
-
- lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-
- if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
- return;
-
- if (lck_ret & LM_OUT_ASYNC)
- gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
- else
- xmote_bh(gl, lck_ret);
-}
-
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
- return ret;
-}
-
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- unsigned int ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-
- gfs2_glock_hold(gl);
-
- ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-
- if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
- return;
-
- if (!ret)
- drop_bh(gl, ret);
- else
- gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-
-static void do_cancels(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- spin_lock(&gl->gl_spin);
-
- while (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
- spin_unlock(&gl->gl_spin);
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
- msleep(100);
- spin_lock(&gl->gl_spin);
- } else {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- spin_lock(&gl->gl_spin);
- }
- }
-
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * glock_wait_internal - wait on a glock acquisition
+ * gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
* Returns: 0 on success
*/
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (test_bit(HIF_ABORTED, &gh->gh_iflags))
- return -EIO;
-
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- spin_lock(&gl->gl_spin);
- if (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- list_del_init(&gh->gh_list);
- gh->gh_error = GLR_TRYFAILED;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- return gh->gh_error;
- }
- spin_unlock(&gl->gl_spin);
- }
-
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- do_cancels(gh);
-
wait_on_holder(gh);
- if (gh->gh_error)
- return gh->gh_error;
-
- gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
- gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
- gh->gh_flags));
-
- if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-
- if (glops->go_lock) {
- gh->gh_error = glops->go_lock(gh);
- if (gh->gh_error) {
- spin_lock(&gl->gl_spin);
- list_del_init(&gh->gh_list);
- spin_unlock(&gl->gl_spin);
- }
- }
-
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- }
-
return gh->gh_error;
}
-static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, head, gh_list) {
- if (gh->gh_owner_pid == pid)
- return gh;
- }
-
- return NULL;
-}
-
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- if (gi) {
+ if (seq) {
+ struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
- seq_printf(gi->seq, gi->string);
- }
- else
+ seq_printf(seq, gi->string);
+ } else {
+ printk(KERN_ERR " ");
vprintk(fmt, args);
+ }
va_end(args);
}
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
*
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ *
*/
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_holder *existing;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct list_head *insert_pt = NULL;
+ struct gfs2_holder *gh2;
+ int try_lock = 0;
BUG_ON(gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
BUG();
- if (!(gh->gh_flags & GL_FLOCK)) {
- existing = find_holder_by_owner(&gl->gl_holders,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(existing->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- existing->gh_gl->gl_name.ln_type,
- existing->gh_gl->gl_state);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(gh->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- gl->gl_name.ln_type, gl->gl_state);
- BUG();
- }
-
- existing = find_holder_by_owner(&gl->gl_waiters3,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- BUG();
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ try_lock = 1;
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail;
+ }
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ goto trap_recursive;
+ if (try_lock &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !may_grant(gl, gh)) {
+fail:
+ gh->gh_error = GLR_TRYFAILED;
+ gfs2_holder_wake(gh);
+ return;
}
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+ insert_pt = &gh2->gh_list;
+ }
+ if (likely(insert_pt == NULL)) {
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+ goto do_cancel;
+ return;
+ }
+ list_add_tail(&gh->gh_list, insert_pt);
+do_cancel:
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ spin_unlock(&gl->gl_spin);
+ if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+ spin_lock(&gl->gl_spin);
}
+ return;
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- list_add(&gh->gh_list, &gl->gl_waiters3);
- else
- list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+trap_recursive:
+ print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+ print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh->gh_gl->gl_name.ln_type, gh->gh_state);
+ __dump_glock(NULL, gl);
+ BUG();
}
/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
struct gfs2_sbd *sdp = gl->gl_sbd;
int error = 0;
-restart:
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
- set_bit(HIF_ABORTED, &gh->gh_iflags);
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return -EIO;
- }
spin_lock(&gl->gl_spin);
add_to_queue(gh);
- run_queue(gl);
+ run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
- if (!(gh->gh_flags & GL_ASYNC)) {
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- goto restart;
- }
- }
+ if (!(gh->gh_flags & GL_ASYNC))
+ error = gfs2_glock_wait(gh);
return error;
}
@@ -1196,48 +980,7 @@ restart:
int gfs2_glock_poll(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- int ready = 0;
-
- spin_lock(&gl->gl_spin);
-
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- ready = 1;
- else if (list_empty(&gh->gh_list)) {
- if (gh->gh_error == GLR_CANCELED) {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- if (gfs2_glock_nq(gh))
- return 1;
- return 0;
- } else
- ready = 1;
- }
-
- spin_unlock(&gl->gl_spin);
-
- return ready;
-}
-
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
- int error;
-
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- gh->gh_flags &= ~GL_ASYNC;
- error = gfs2_glock_nq(gh);
- }
-
- return error;
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
struct gfs2_glock *gl = gh->gh_gl;
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned delay = 0;
+ int fast_path = 0;
+ spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_lock(gl);
-
- spin_lock(&gl->gl_spin);
list_del_init(&gh->gh_list);
-
- if (list_empty(&gl->gl_holders)) {
+ if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
+ GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
spin_unlock(&gl->gl_spin);
glops->go_unlock(gh);
spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
gl->gl_stamp = jiffies;
+ if (list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ fast_path = 1;
}
-
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
+ if (likely(fast_path))
+ return;
gfs2_glock_hold(gl);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
{
int error;
- gfs2_glmutex_lock(gl);
-
if (!atomic_read(&gl->gl_lvb_count)) {
error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
- if (error) {
- gfs2_glmutex_unlock(gl);
+ if (error)
return error;
- }
gfs2_glock_hold(gl);
}
atomic_inc(&gl->gl_lvb_count);
- gfs2_glmutex_unlock(gl);
-
return 0;
}
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
gfs2_glock_hold(gl);
- gfs2_glmutex_lock(gl);
-
gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
if (atomic_dec_and_test(&gl->gl_lvb_count)) {
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
gl->gl_lvb = NULL;
gfs2_glock_put(gl);
}
-
- gfs2_glmutex_unlock(gl);
gfs2_glock_put(gl);
}
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
if (time_before(now, holdtime))
delay = holdtime - now;
+ spin_lock(&gl->gl_spin);
handle_callback(gl, state, 1, delay);
+ spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
gl = gfs2_glock_find(sdp, &async->lc_name);
if (gfs2_assert_warn(sdp, gl))
return;
- xmote_bh(gl, async->lc_ret);
+ gl->gl_reply = async->lc_ret;
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
wake_up_process(sdp->sd_recoverd_process);
return;
- case LM_CB_DROPLOCKS:
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- gfs2_quota_scan(sdp);
- return;
-
default:
gfs2_assert_warn(sdp, 0);
return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
+ int done_callback = 0;
spin_lock(&sdp->sd_reclaim_lock);
if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
atomic_dec(&sdp->sd_reclaim_count);
atomic_inc(&sdp->sd_reclaimed);
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ done_callback = 1;
}
-
- gfs2_glock_put(gl);
+ spin_unlock(&gl->gl_spin);
+ if (!done_callback ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
{
if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
return;
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ return;
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- goto out_schedule;
- gfs2_glmutex_unlock(gl);
- }
- return;
-
-out_schedule:
- gfs2_glmutex_unlock(gl);
- gfs2_glock_schedule_for_reclaim(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ gfs2_glock_schedule_for_reclaim(gl);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
spin_unlock(&sdp->sd_reclaim_lock);
}
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
- }
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
* @sdp: the filesystem
* @wait: wait until it's all gone
*
- * Called when unmounting the filesystem, or when inter-node lock manager
- * requests DROPLOCKS because it is running out of capacity.
+ * Called when unmounting the filesystem.
*/
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
unsigned long t;
unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
cont = 1;
}
- if (!wait || !cont)
+ if (!cont)
break;
if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
}
}
-/*
- * Diagnostic routines to help debug distributed deadlock
- */
-
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
- unsigned long address)
+static const char *state2str(unsigned state)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
- print_dbg(gi, fmt, buffer);
+ switch(state) {
+ case LM_ST_UNLOCKED:
+ return "UN";
+ case LM_ST_SHARED:
+ return "SH";
+ case LM_ST_DEFERRED:
+ return "DF";
+ case LM_ST_EXCLUSIVE:
+ return "EX";
+ }
+ return "??";
+}
+
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+ char *p = buf; /* write cursor: one letter per set holder flag, at most 12 letters plus NUL go into buf */
+ if (flags & LM_FLAG_TRY)
+ *p++ = 't';
+ if (flags & LM_FLAG_TRY_1CB)
+ *p++ = 'T';
+ if (flags & LM_FLAG_NOEXP)
+ *p++ = 'e';
+ if (flags & LM_FLAG_ANY)
+ *p++ = 'A'; /* upper case so it cannot be confused with GL_ASYNC's 'a' */
+ if (flags & LM_FLAG_PRIORITY)
+ *p++ = 'p';
+ if (flags & GL_ASYNC)
+ *p++ = 'a';
+ if (flags & GL_EXACT)
+ *p++ = 'E';
+ if (flags & GL_ATIME)
+ *p++ = 'm'; /* NOTE(review): was 'a', colliding with GL_ASYNC; letter picked only for uniqueness - confirm */
+ if (flags & GL_NOCACHE)
+ *p++ = 'c';
+ if (test_bit(HIF_HOLDER, &iflags))
+ *p++ = 'H';
+ if (test_bit(HIF_WAIT, &iflags))
+ *p++ = 'W';
+ if (test_bit(HIF_FIRST, &iflags))
+ *p++ = 'F';
+ *p = 0; /* NUL-terminate so the result can be printed with %s */
+ return buf;
}
/**
* dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
* @gh: the glock holder
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_holder(struct glock_iter *gi, char *str,
- struct gfs2_holder *gh)
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
- unsigned int x;
- struct task_struct *gh_owner;
+ struct task_struct *gh_owner = NULL;
+ char buffer[KSYM_SYMBOL_LEN];
+ char flags_buf[32];
- print_dbg(gi, " %s\n", str);
- if (gh->gh_owner_pid) {
- print_dbg(gi, " owner = %ld ",
- (long)pid_nr(gh->gh_owner_pid));
+ sprint_symbol(buffer, gh->gh_ip);
+ if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- if (gh_owner)
- print_dbg(gi, "(%s)\n", gh_owner->comm);
- else
- print_dbg(gi, "(ended)\n");
- } else
- print_dbg(gi, " owner = -1\n");
- print_dbg(gi, " gh_state = %u\n", gh->gh_state);
- print_dbg(gi, " gh_flags =");
- for (x = 0; x < 32; x++)
- if (gh->gh_flags & (1 << x))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- print_dbg(gi, " error = %d\n", gh->gh_error);
- print_dbg(gi, " gh_iflags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &gh->gh_iflags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
-
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)", buffer);
return 0;
}
-/**
- * dump_inode - print information about an inode
- * @ip: the inode
- *
- * Returns: 0 on success, -ENOBUFS when we run out of space
- */
-
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
-{
- unsigned int x;
-
- print_dbg(gi, " Inode:\n");
- print_dbg(gi, " num = %llu/%llu\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr);
- print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
- print_dbg(gi, " i_flags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &ip->i_flags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- return 0;
+static const char *gflags2str(char *buf, const unsigned long *gflags)
+{
+ char *p = buf; /* write cursor: one letter per set GLF_* bit, at most 9 letters plus NUL go into buf */
+ if (test_bit(GLF_LOCK, gflags))
+ *p++ = 'l';
+ if (test_bit(GLF_STICKY, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_DEMOTE, gflags))
+ *p++ = 'D';
+ if (test_bit(GLF_PENDING_DEMOTE, gflags))
+ *p++ = 'd';
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
+ *p++ = 'p';
+ if (test_bit(GLF_DIRTY, gflags))
+ *p++ = 'y';
+ if (test_bit(GLF_LFLUSH, gflags))
+ *p++ = 'f';
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
+ *p++ = 'i';
+ if (test_bit(GLF_REPLY_PENDING, gflags))
+ *p++ = 'r';
+ *p = 0; /* NUL-terminate so the result can be printed with %s */
+ return buf; /* same pointer the caller passed in, for use directly as a printf argument */
}
/**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
* @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that it's possible to see if they are composed of spaces for
+ * example. The fields are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
- struct gfs2_holder *gh;
- unsigned int x;
- int error = -ENOBUFS;
- struct task_struct *gl_owner;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned long long dtime;
+ const struct gfs2_holder *gh;
+ char gflags_buf[32];
+ int error = 0;
- spin_lock(&gl->gl_spin);
+ dtime = jiffies - gl->gl_demote_time;
+ dtime *= 1000000/HZ; /* demote time in uSec */
+ if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+ dtime = 0;
+ gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+ state2str(gl->gl_state),
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number,
+ gflags2str(gflags_buf, &gl->gl_flags),
+ state2str(gl->gl_target),
+ state2str(gl->gl_demote_state), dtime,
+ atomic_read(&gl->gl_lvb_count),
+ atomic_read(&gl->gl_ail_count),
+ atomic_read(&gl->gl_ref));
- print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
- print_dbg(gi, " gl_flags =");
- for (x = 0; x < 32; x++) {
- if (test_bit(x, &gl->gl_flags))
- print_dbg(gi, " %u", x);
- }
- if (!test_bit(GLF_LOCK, &gl->gl_flags))
- print_dbg(gi, " (unlocked)");
- print_dbg(gi, " \n");
- print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
- print_dbg(gi, " gl_state = %u\n", gl->gl_state);
- if (gl->gl_owner_pid) {
- gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
- if (gl_owner)
- print_dbg(gi, " gl_owner = pid %d (%s)\n",
- pid_nr(gl->gl_owner_pid), gl_owner->comm);
- else
- print_dbg(gi, " gl_owner = %d (ended)\n",
- pid_nr(gl->gl_owner_pid));
- } else
- print_dbg(gi, " gl_owner = -1\n");
- print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
- print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
- print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
- print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
- print_dbg(gi, " reclaim = %s\n",
- (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
- if (gl->gl_aspace)
- print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
- gl->gl_aspace->i_mapping->nrpages);
- else
- print_dbg(gi, " aspace = no\n");
- print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
- if (gl->gl_req_gh) {
- error = dump_holder(gi, "Request", gl->gl_req_gh);
- if (error)
- goto out;
- }
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- error = dump_holder(gi, "Holder", gh);
+ error = dump_holder(seq, gh);
if (error)
goto out;
}
- list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
- error = dump_holder(gi, "Waiter1", gh);
- if (error)
- goto out;
- }
- list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
- error = dump_holder(gi, "Waiter3", gh);
- if (error)
- goto out;
- }
- if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
- gl->gl_demote_state, (unsigned long long)
- (jiffies - gl->gl_demote_time)*(1000000/HZ));
- }
- if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
- if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
- list_empty(&gl->gl_holders)) {
- error = dump_inode(gi, gl->gl_object);
- if (error)
- goto out;
- } else {
- error = -ENOBUFS;
- print_dbg(gi, " Inode: busy\n");
- }
- }
-
- error = 0;
-
+ if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
+ error = glops->go_dump(seq, gl);
out:
- spin_unlock(&gl->gl_spin);
return error;
}
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+ int ret; /* result of __dump_glock(), handed back unchanged */
+ spin_lock(&gl->gl_spin); /* __dump_glock() walks gl->gl_holders, so take gl_spin around it */
+ ret = __dump_glock(seq, gl);
+ spin_unlock(&gl->gl_spin);
+ return ret;
+}
+
/**
* gfs2_dump_lockstate - print out the current lockstate
* @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
struct gfs2_glock *gl;
@@ -2104,7 +1824,7 @@ restart:
gfs2_glock_put(gl);
if (gl && gi->gl == NULL)
gi->hash++;
- while(gi->gl == NULL) {
+ while (gi->gl == NULL) {
if (gi->hash >= GFS2_GL_HASH_SIZE)
return 1;
read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
return 0;
}
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
if (gi->gl)
gfs2_glock_put(gi->gl);
- kfree(gi);
-}
-
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
- struct glock_iter *gi;
-
- gi = kmalloc(sizeof (*gi), GFP_KERNEL);
- if (!gi)
- return NULL;
-
- gi->sdp = sdp;
- gi->hash = 0;
- gi->seq = NULL;
gi->gl = NULL;
- memset(gi->string, 0, sizeof(gi->string));
-
- if (gfs2_glock_iter_next(gi)) {
- gfs2_glock_iter_free(gi);
- return NULL;
- }
-
- return gi;
}
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct glock_iter *gi;
+ struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
- gi = gfs2_glock_iter_init(file->private);
- if (!gi)
- return NULL;
+ gi->hash = 0;
- while(n--) {
+ do {
if (gfs2_glock_iter_next(gi)) {
gfs2_glock_iter_free(gi);
return NULL;
}
- }
+ } while (n--);
- return gi;
+ return gi->gl;
}
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct glock_iter *gi = iter_ptr;
+ struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
return NULL;
}
- return gi;
+ return gi->gl;
}
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
- if (gi)
- gfs2_glock_iter_free(gi);
+ struct gfs2_glock_iter *gi = seq->private;
+ gfs2_glock_iter_free(gi);
}
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
-
- gi->seq = file;
- dump_glock(gi, gi->gl);
-
- return 0;
+ return dump_glock(seq, iter_ptr);
}
static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &gfs2_glock_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
+ int ret = seq_open_private(file, &gfs2_glock_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
}
static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
.open = gfs2_debugfs_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release
+ .release = seq_release_private,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f815..971d92af70f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
#define GL_SKIP 0x00000100
#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
-#define GL_FLOCK 0x00000800
-#define GL_NOCANCEL 0x00001000
#define GLR_TRYFAILED 13
-#define GLR_CANCELED 14
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
spin_lock(&gl->gl_spin);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ break;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
{
int ret;
spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+ ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return ret;
}
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
* gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
void gfs2_lvb_unhold(struct gfs2_glock *gl);
void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-
void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
int __init gfs2_glock_init(void);
void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda..c6c318c2a0f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
}
/**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh = gl->gl_req_gh;
- struct buffer_head *bh;
- int error;
-
- if (gl->gl_state != LM_ST_UNLOCKED &&
- (!gh || !(gh->gh_flags & GL_SKIP))) {
- error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
- if (!error)
- brelse(bh);
- }
-}
-
-/**
* inode_go_inval - prepare a inode glock to be released
* @gl: the glock
* @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
}
/**
+ * inode_go_dump - print information about the inode attached to a glock
+ * @seq: the seq_file handed through to gfs2_print_dbg()
+ * @gl: the glock; its gl_object is read as the inode to print
+ *
+ * Returns: 0 in all cases; nothing is printed when gl_object is NULL
+ */
+
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_inode *ip = gl->gl_object;
+ if (ip == NULL) /* no inode attached to this glock: nothing to print */
+ return 0;
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ IF2DT(ip->i_inode.i_mode), ip->i_flags);
+ return 0;
+}
+
+/**
* rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
* @gl: the glock
*
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
}
/**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: the seq_file handed through to gfs2_print_dbg()
+ * @gl: The glock in question; its gl_object is read as the rgrp
+ * Returns: 0 in all cases; nothing is printed when gl_object is NULL
+ */
+
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_rgrpd *rgd = gl->gl_object;
+ if (rgd == NULL) /* no rgrp attached to this glock: nothing to print */
+ return 0;
+ gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+ return 0;
+}
+
+/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
*
*/
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
struct gfs2_log_header_host head;
int error;
- if (gl->gl_state != LM_ST_UNLOCKED &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
gfs2_log_pointers_init(sdp, head.lh_blkno);
}
}
+ return 0;
}
/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
const struct gfs2_glock_operations gfs2_inode_glops = {
.go_xmote_th = inode_go_sync,
- .go_xmote_bh = inode_go_xmote_bh,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_lock = inode_go_lock,
+ .go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_demote_ok = rgrp_go_demote_ok,
.go_lock = rgrp_go_lock,
.go_unlock = rgrp_go_unlock,
+ .go_dump = rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41d..448697a5c46 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
struct gfs2_rgrpd {
struct list_head rd_list; /* Link with superblock */
struct list_head rd_list_mru;
- struct list_head rd_recent; /* Recently used rgrps */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
- void (*go_xmote_bh) (struct gfs2_glock *gl);
+ int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
+ int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
const int go_type;
const unsigned long go_min_hold_time;
};
enum {
/* States */
- HIF_HOLDER = 6,
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_FIRST = 7,
- HIF_ABORTED = 9,
HIF_WAIT = 10,
};
@@ -154,20 +153,20 @@ struct gfs2_holder {
unsigned gh_flags;
int gh_error;
- unsigned long gh_iflags;
+ unsigned long gh_iflags; /* HIF_... */
unsigned long gh_ip;
};
enum {
- GLF_LOCK = 1,
- GLF_STICKY = 2,
- GLF_DEMOTE = 3,
- GLF_PENDING_DEMOTE = 4,
- GLF_DIRTY = 5,
- GLF_DEMOTE_IN_PROGRESS = 6,
- GLF_LFLUSH = 7,
- GLF_WAITERS2 = 8,
- GLF_CONV_DEADLK = 9,
+ GLF_LOCK = 1,
+ GLF_STICKY = 2,
+ GLF_DEMOTE = 3,
+ GLF_PENDING_DEMOTE = 4,
+ GLF_DEMOTE_IN_PROGRESS = 5,
+ GLF_DIRTY = 6,
+ GLF_LFLUSH = 7,
+ GLF_INVALIDATE_IN_PROGRESS = 8,
+ GLF_REPLY_PENDING = 9,
};
struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
unsigned int gl_state;
+ unsigned int gl_target;
+ unsigned int gl_reply;
unsigned int gl_hash;
unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
- struct pid *gl_owner_pid;
- unsigned long gl_ip;
struct list_head gl_holders;
- struct list_head gl_waiters1; /* HIF_MUTEX */
- struct list_head gl_waiters3; /* HIF_PROMOTE */
const struct gfs2_glock_operations *gl_ops;
-
- struct gfs2_holder *gl_req_gh;
-
void *gl_lock;
char *gl_lvb;
atomic_t gl_lvb_count;
@@ -427,7 +421,6 @@ struct gfs2_tune {
unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
unsigned int gt_atime_quantum; /* Min secs between atime updates */
unsigned int gt_new_files_jdata;
- unsigned int gt_new_files_directio;
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
@@ -534,7 +527,6 @@ struct gfs2_sbd {
struct mutex sd_rindex_mutex;
struct list_head sd_rindex_list;
struct list_head sd_rindex_mru_list;
- struct list_head sd_rindex_recent_list;
struct gfs2_rgrpd *sd_rindex_forward;
unsigned int sd_rgrps;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e4..6da0ab355b8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
}
if (!is_root) {
- error = permission(dir, MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_EXEC);
if (error)
goto out;
}
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
{
int error;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
gfs2_tune_get(sdp, gt_new_files_jdata))
di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
- gfs2_tune_get(sdp, gt_new_files_directio))
- di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
} else if (S_ISDIR(mode)) {
di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
- GFS2_DIF_INHERIT_DIRECTIO);
- di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
GFS2_DIF_INHERIT_JDATA);
}
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
if (IS_APPEND(&dip->i_inode))
return -EPERM;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38..6074c2506f7 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
}
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
void gfs2_set_iop(struct inode *inode);
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip);
int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
const struct gfs2_inode *ip);
+int gfs2_permission(struct inode *inode, int mask);
int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
int gfs2_glock_nq_atime(struct gfs2_holder *gh);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee72878..523243a13a2 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
const struct lm_lockops *lw_ops;
};
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj);
+
/* List of registered low-level locking protocols. A file system selects one
of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+ .lm_proto_name = "lock_nolock",
+ .lm_mount = nolock_mount,
+};
+
+static struct lmh_wrapper nolock_proto = {
+ .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+ .lw_ops = &nolock_ops,
+};
+
static LIST_HEAD(lmh_list);
static DEFINE_MUTEX(lmh_lock);
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{ /* lm_mount hook of lock_nolock: reads an optional "jid=<n>" from host_data and fills in *lockstruct; always returns 0 */
+ char *c; /* points at the "jid=" option inside host_data, or NULL when absent */
+ unsigned int jid = 0; /* pre-set so a "jid=" with no parsable number yields 0 instead of an uninitialized value */
+
+ c = strstr(host_data, "jid=");
+ if (!c)
+ jid = 0;
+ else {
+ c += 4; /* skip past the "jid=" prefix */
+ sscanf(c, "%u", &jid); /* on a failed conversion jid keeps its initial 0 */
+ }
+
+ lockstruct->ls_jid = jid;
+ lockstruct->ls_first = 1;
+ lockstruct->ls_lvb_size = min_lvb_size;
+ lockstruct->ls_ops = &nolock_ops;
+ lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+ return 0;
+}
+
/**
* gfs2_register_lockproto - Register a low-level locking protocol
* @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
int try = 0;
int error, found;
+
retry:
mutex_lock(&lmh_lock);
+ if (list_empty(&nolock_proto.lw_list))
+ list_add(&nolock_proto.lw_list, &lmh_list);
+
found = 0;
list_for_each_entry(lw, &lmh_list, lw_list) {
if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
goto out;
}
- if (!try_module_get(lw->lw_ops->lm_owner)) {
+ if (lw->lw_ops->lm_owner &&
+ !try_module_get(lw->lw_ops->lm_owner)) {
try = 0;
mutex_unlock(&lmh_lock);
msleep(1000);
@@ -158,7 +205,8 @@ out:
void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
{
mutex_lock(&lmh_lock);
- lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_unmount)
+ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
if (lockstruct->ls_ops->lm_owner)
module_put(lockstruct->ls_ops->lm_owner);
mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec8..2482c904750 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+
+/* convert dlm lock-mode to gfs lock-state */
+
+static s16 gdlm_make_lmstate(s16 dlmmode)
{
- struct gdlm_ls *ls = lp->ls;
+ switch (dlmmode) {
+ case DLM_LOCK_IV:
+ case DLM_LOCK_NL:
+ return LM_ST_UNLOCKED;
+ case DLM_LOCK_EX:
+ return LM_ST_EXCLUSIVE;
+ case DLM_LOCK_CW:
+ return LM_ST_DEFERRED;
+ case DLM_LOCK_PR:
+ return LM_ST_SHARED;
+ }
+ gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+ return -1;
+}
- clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+ thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
spin_lock(&ls->async_lock);
- list_add_tail(&lp->clist, &ls->complete);
+ list_add_tail(&lp->delay_list, &ls->submit);
spin_unlock(&ls->async_lock);
wake_up(&ls->thread_wait);
}
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
{
- queue_complete(astarg);
+ clear_bit(LFL_AST_WAIT, &lp->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&lp->flags, LFL_AST_WAIT);
}
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
{
- struct gdlm_lock *lp = astarg;
struct gdlm_ls *ls = lp->ls;
- if (!mode) {
- printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- return;
- }
-
spin_lock(&ls->async_lock);
- if (!lp->bast_mode) {
- list_add_tail(&lp->blist, &ls->blocking);
- lp->bast_mode = mode;
- } else if (lp->bast_mode < mode)
- lp->bast_mode = mode;
+ if (!list_empty(&lp->delay_list))
+ list_del_init(&lp->delay_list);
+ ls->all_locks_count--;
spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
+
+ kfree(lp);
}
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
{
struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
spin_unlock(&ls->async_lock);
}
+static void process_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct lm_async_cb acb;
+
+ memset(&acb, 0, sizeof(acb));
+
+ if (lp->lksb.sb_status == -DLM_ECANCEL) {
+ log_info("complete dlm cancel %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+ log_info("unlock sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ lp->cur = DLM_LOCK_IV;
+ lp->req = DLM_LOCK_IV;
+ lp->lksb.sb_lkid = 0;
+
+ if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+ gdlm_delete_lp(lp);
+ return;
+ }
+ goto out;
+ }
+
+ if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+ memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+ if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+ if (lp->req == DLM_LOCK_PR)
+ lp->req = DLM_LOCK_CW;
+ else if (lp->req == DLM_LOCK_CW)
+ lp->req = DLM_LOCK_PR;
+ }
+
+ /*
+ * A canceled lock request. The lock was just taken off the delayed
+ * list and was never even submitted to dlm.
+ */
+
+ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+ log_info("complete internal cancel %x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ goto out;
+ }
+
+ /*
+ * An error occurred.
+ */
+
+ if (lp->lksb.sb_status) {
+ /* a "normal" error */
+ if ((lp->lksb.sb_status == -EAGAIN) &&
+ (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->req = lp->cur;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ /* this could only happen with cancels I think */
+ log_info("ast sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ /*
+ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+ */
+
+ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * A lock has been demoted to NL because it initially completed during
+ * BLOCK_LOCKS. Now it must be requested in the originally requested
+ * mode.
+ */
+
+ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+ gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+
+ lp->cur = DLM_LOCK_NL;
+ lp->req = lp->prev_req;
+ lp->prev_req = DLM_LOCK_IV;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags))
+ gdlm_queue_delayed(lp);
+ else
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * A request is granted during dlm recovery. It may be granted
+ * because the locks of a failed node were cleared. In that case,
+ * there may be inconsistent data beneath this lock and we must wait
+ * for recovery to complete to use it. When gfs recovery is done this
+ * granted lock will be converted to NL and then reacquired in this
+ * granted state.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) &&
+ lp->req != DLM_LOCK_NL) {
+
+ lp->cur = lp->req;
+ lp->prev_req = lp->req;
+ lp->req = DLM_LOCK_NL;
+ lp->lkf |= DLM_LKF_CONVERT;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ log_debug("rereq %x,%llx id %x %d,%d",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lp->req);
+
+ set_bit(LFL_REREQUEST, &lp->flags);
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * DLM demoted the lock to NL before it was granted so GFS must be
+ * told it cannot cache data for this lock.
+ */
+
+ if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+ /*
+ * This is an internal lock_dlm lock
+ */
+
+ if (test_bit(LFL_INLOCK, &lp->flags)) {
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * Normal completion of a lock request. Tell GFS it now has the lock.
+ */
+
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+
+ acb.lc_name = lp->lockname;
+ acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+ ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static void gdlm_ast(void *astarg)
+{
+ struct gdlm_lock *lp = astarg;
+ clear_bit(LFL_ACTIVE, &lp->flags);
+ process_complete(lp);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int cb = 0;
+
+ switch (gdlm_make_lmstate(bast_mode)) {
+ case LM_ST_EXCLUSIVE:
+ cb = LM_CB_NEED_E;
+ break;
+ case LM_ST_DEFERRED:
+ cb = LM_CB_NEED_D;
+ break;
+ case LM_ST_SHARED:
+ cb = LM_CB_NEED_S;
+ break;
+ default:
+ gdlm_assert(0, "unknown bast mode %u", bast_mode);
+ }
+
+ ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+
+static void gdlm_bast(void *astarg, int mode)
+{
+ struct gdlm_lock *lp = astarg;
+
+ if (!mode) {
+ printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ return;
+ }
+
+ process_blocking(lp, mode);
+}
+
/* convert gfs lock-state to dlm lock-mode */
static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
return -1;
}
-/* convert dlm lock-mode to gfs lock-state */
-
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
- switch (dlmmode) {
- case DLM_LOCK_IV:
- case DLM_LOCK_NL:
- return LM_ST_UNLOCKED;
- case DLM_LOCK_EX:
- return LM_ST_EXCLUSIVE;
- case DLM_LOCK_CW:
- return LM_ST_DEFERRED;
- case DLM_LOCK_PR:
- return LM_ST_SHARED;
- }
- gdlm_assert(0, "unknown DLM mode %d", dlmmode);
- return -1;
-}
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
if (lp->lksb.sb_lkid != 0) {
lkf |= DLM_LKF_CONVERT;
-
- /* Conversion deadlock avoidance by DLM */
-
- if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
- !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
- !(lkf & DLM_LKF_NOQUEUE) &&
- cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
- lkf |= DLM_LKF_CONVDEADLK;
}
if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
make_strname(name, &lp->strname);
lp->ls = ls;
lp->cur = DLM_LOCK_IV;
- lp->lvb = NULL;
- lp->hold_null = NULL;
- INIT_LIST_HEAD(&lp->clist);
- INIT_LIST_HEAD(&lp->blist);
INIT_LIST_HEAD(&lp->delay_list);
spin_lock(&ls->async_lock);
- list_add(&lp->all_list, &ls->all_locks);
ls->all_locks_count++;
spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
return 0;
}
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- if (!list_empty(&lp->clist))
- list_del_init(&lp->clist);
- if (!list_empty(&lp->blist))
- list_del_init(&lp->blist);
- if (!list_empty(&lp->delay_list))
- list_del_init(&lp->delay_list);
- gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- list_del_init(&lp->all_list);
- ls->all_locks_count--;
- spin_unlock(&ls->async_lock);
-
- kfree(lp);
-}
-
int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
void **lockp)
{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
lp->lksb.sb_status = -EAGAIN;
- queue_complete(lp);
+ gdlm_ast(lp);
error = 0;
}
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
{
struct gdlm_lock *lp = lock;
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
clear_bit(LFL_DLM_CANCEL, &lp->flags);
if (flags & LM_FLAG_NOEXP)
set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
if (delay_list) {
set_bit(LFL_CANCEL, &lp->flags);
set_bit(LFL_ACTIVE, &lp->flags);
- queue_complete(lp);
+ gdlm_ast(lp);
return;
}
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
wake_up(&ls->thread_wait);
}
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
- struct gdlm_lock *lp, *safe;
- int count = 0;
-
- spin_lock(&ls->async_lock);
- list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
- list_del_init(&lp->all_list);
-
- if (lp->lvb && lp->lvb != junk_lvb)
- kfree(lp->lvb);
- kfree(lp);
- count++;
- }
- spin_unlock(&ls->async_lock);
-
- return count;
-}
-
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54..3c98e7c6f93 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
int recover_jid_done;
int recover_jid_status;
spinlock_t async_lock;
- struct list_head complete;
- struct list_head blocking;
struct list_head delayed;
struct list_head submit;
- struct list_head all_locks;
u32 all_locks_count;
wait_queue_head_t wait_control;
- struct task_struct *thread1;
- struct task_struct *thread2;
+ struct task_struct *thread;
wait_queue_head_t thread_wait;
- unsigned long drop_time;
- int drop_locks_count;
- int drop_locks_period;
};
enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
u32 lkf; /* dlm flags DLM_LKF_ */
unsigned long flags; /* lock_dlm flags LFL_ */
- int bast_mode; /* protected by async_lock */
-
- struct list_head clist; /* complete */
- struct list_head blist; /* blocking */
struct list_head delay_list; /* delayed */
- struct list_head all_list; /* all locks for the fs */
struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
};
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
/* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
unsigned int gdlm_do_lock(struct gdlm_lock *);
int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b5..09d78c216f4 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
if (!ls)
return NULL;
- ls->drop_locks_count = GDLM_DROP_COUNT;
- ls->drop_locks_period = GDLM_DROP_PERIOD;
ls->fscb = cb;
ls->sdp = sdp;
ls->fsflags = flags;
spin_lock_init(&ls->async_lock);
- INIT_LIST_HEAD(&ls->complete);
- INIT_LIST_HEAD(&ls->blocking);
INIT_LIST_HEAD(&ls->delayed);
INIT_LIST_HEAD(&ls->submit);
- INIT_LIST_HEAD(&ls->all_locks);
init_waitqueue_head(&ls->thread_wait);
init_waitqueue_head(&ls->wait_control);
- ls->thread1 = NULL;
- ls->thread2 = NULL;
- ls->drop_time = jiffies;
ls->jid = -1;
strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
static void gdlm_unmount(void *lockspace)
{
struct gdlm_ls *ls = lockspace;
- int rv;
log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
gdlm_kobject_release(ls);
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- rv = gdlm_release_all_locks(ls);
- if (rv)
- log_info("gdlm_unmount: %d stray locks freed", rv);
+ BUG_ON(ls->all_locks_count);
out:
kfree(ls);
}
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- gdlm_release_all_locks(ls);
gdlm_kobject_release(ls);
}
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9e..4ec571c3d8a 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
return sprintf(buf, "%d\n", ls->recover_jid_status);
}
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
- return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
- ls->drop_locks_count = simple_strtol(buf, NULL, 0);
- return len;
-}
-
struct gdlm_attr {
struct attribute attr;
ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
GDLM_ATTR(recover, 0644, recover_show, recover_store);
GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
static struct attribute *gdlm_attrs[] = {
&gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
&gdlm_attr_recover.attr,
&gdlm_attr_recover_done.attr,
&gdlm_attr_recover_status.attr,
- &gdlm_attr_drop_count.attr,
NULL,
};
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28a..38823efd698 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
#include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
- thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- list_add_tail(&lp->delay_list, &ls->submit);
- spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
- struct gdlm_ls *ls = lp->ls;
- unsigned int cb = 0;
-
- switch (gdlm_make_lmstate(bast_mode)) {
- case LM_ST_EXCLUSIVE:
- cb = LM_CB_NEED_E;
- break;
- case LM_ST_DEFERRED:
- cb = LM_CB_NEED_D;
- break;
- case LM_ST_SHARED:
- cb = LM_CB_NEED_S;
- break;
- default:
- gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
- }
-
- ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
- clear_bit(LFL_AST_WAIT, &lp->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
- struct lm_async_cb acb;
- s16 prev_mode = lp->cur;
-
- memset(&acb, 0, sizeof(acb));
-
- if (lp->lksb.sb_status == -DLM_ECANCEL) {
- log_info("complete dlm cancel %x,%llx flags %lx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
-
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
- if (lp->lksb.sb_status != -DLM_EUNLOCK) {
- log_info("unlock sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- return;
- }
-
- lp->cur = DLM_LOCK_IV;
- lp->req = DLM_LOCK_IV;
- lp->lksb.sb_lkid = 0;
-
- if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
- gdlm_delete_lp(lp);
- return;
- }
- goto out;
- }
-
- if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
- memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
- if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (lp->req == DLM_LOCK_PR)
- lp->req = DLM_LOCK_CW;
- else if (lp->req == DLM_LOCK_CW)
- lp->req = DLM_LOCK_PR;
- }
-
- /*
- * A canceled lock request. The lock was just taken off the delayed
- * list and was never even submitted to dlm.
- */
-
- if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
- log_info("complete internal cancel %x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- goto out;
- }
-
- /*
- * An error occured.
- */
-
- if (lp->lksb.sb_status) {
- /* a "normal" error */
- if ((lp->lksb.sb_status == -EAGAIN) &&
- (lp->lkf & DLM_LKF_NOQUEUE)) {
- lp->req = lp->cur;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- /* this could only happen with cancels I think */
- log_info("ast sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- if (lp->lksb.sb_status == -EDEADLOCK &&
- lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CONV_DEADLK;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- } else
- return;
- }
-
- /*
- * This is an AST for an EX->EX conversion for sync_lvb from GFS.
- */
-
- if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
- wake_up_ast(lp);
- return;
- }
-
- /*
- * A lock has been demoted to NL because it initially completed during
- * BLOCK_LOCKS. Now it must be requested in the originally requested
- * mode.
- */
-
- if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
- gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
-
- lp->cur = DLM_LOCK_NL;
- lp->req = lp->prev_req;
- lp->prev_req = DLM_LOCK_IV;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- set_bit(LFL_NOCACHE, &lp->flags);
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags))
- gdlm_queue_delayed(lp);
- else
- queue_submit(lp);
- return;
- }
-
- /*
- * A request is granted during dlm recovery. It may be granted
- * because the locks of a failed node were cleared. In that case,
- * there may be inconsistent data beneath this lock and we must wait
- * for recovery to complete to use it. When gfs recovery is done this
- * granted lock will be converted to NL and then reacquired in this
- * granted state.
- */
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags) &&
- lp->req != DLM_LOCK_NL) {
-
- lp->cur = lp->req;
- lp->prev_req = lp->req;
- lp->req = DLM_LOCK_NL;
- lp->lkf |= DLM_LKF_CONVERT;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- log_debug("rereq %x,%llx id %x %d,%d",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->lksb.sb_lkid, lp->cur, lp->req);
-
- set_bit(LFL_REREQUEST, &lp->flags);
- queue_submit(lp);
- return;
- }
-
- /*
- * DLM demoted the lock to NL before it was granted so GFS must be
- * told it cannot cache data for this lock.
- */
-
- if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
- set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
- /*
- * This is an internal lock_dlm lock
- */
-
- if (test_bit(LFL_INLOCK, &lp->flags)) {
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
- wake_up_ast(lp);
- return;
- }
-
- /*
- * Normal completion of a lock request. Tell GFS it now has the lock.
- */
-
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
-
- acb.lc_name = lp->lockname;
- acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
- if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
- (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
- acb.lc_ret |= LM_OUT_CACHEABLE;
-
- ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static inline int no_work(struct gdlm_ls *ls, int blocking)
+static inline int no_work(struct gdlm_ls *ls)
{
int ret;
spin_lock(&ls->async_lock);
- ret = list_empty(&ls->complete) && list_empty(&ls->submit);
- if (ret && blocking)
- ret = list_empty(&ls->blocking);
+ ret = list_empty(&ls->submit);
spin_unlock(&ls->async_lock);
return ret;
}
-static inline int check_drop(struct gdlm_ls *ls)
-{
- if (!ls->drop_locks_count)
- return 0;
-
- if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
- ls->drop_time = jiffies;
- if (ls->all_locks_count >= ls->drop_locks_count)
- return 1;
- }
- return 0;
-}
-
-static int gdlm_thread(void *data, int blist)
+static int gdlm_thread(void *data)
{
struct gdlm_ls *ls = (struct gdlm_ls *) data;
struct gdlm_lock *lp = NULL;
- uint8_t complete, blocking, submit, drop;
-
- /* Only thread1 is allowed to do blocking callbacks since gfs
- may wait for a completion callback within a blocking cb. */
while (!kthread_should_stop()) {
wait_event_interruptible(ls->thread_wait,
- !no_work(ls, blist) || kthread_should_stop());
-
- complete = blocking = submit = drop = 0;
+ !no_work(ls) || kthread_should_stop());
spin_lock(&ls->async_lock);
- if (blist && !list_empty(&ls->blocking)) {
- lp = list_entry(ls->blocking.next, struct gdlm_lock,
- blist);
- list_del_init(&lp->blist);
- blocking = lp->bast_mode;
- lp->bast_mode = 0;
- } else if (!list_empty(&ls->complete)) {
- lp = list_entry(ls->complete.next, struct gdlm_lock,
- clist);
- list_del_init(&lp->clist);
- complete = 1;
- } else if (!list_empty(&ls->submit)) {
+ if (!list_empty(&ls->submit)) {
lp = list_entry(ls->submit.next, struct gdlm_lock,
delay_list);
list_del_init(&lp->delay_list);
- submit = 1;
+ spin_unlock(&ls->async_lock);
+ gdlm_do_lock(lp);
+ spin_lock(&ls->async_lock);
}
-
- drop = check_drop(ls);
spin_unlock(&ls->async_lock);
-
- if (complete)
- process_complete(lp);
-
- else if (blocking)
- process_blocking(lp, blocking);
-
- else if (submit)
- gdlm_do_lock(lp);
-
- if (drop)
- ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-
- schedule();
}
return 0;
}
-static int gdlm_thread1(void *data)
-{
- return gdlm_thread(data, 1);
-}
-
-static int gdlm_thread2(void *data)
-{
- return gdlm_thread(data, 0);
-}
-
int gdlm_init_threads(struct gdlm_ls *ls)
{
struct task_struct *p;
int error;
- p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
- error = IS_ERR(p);
- if (error) {
- log_error("can't start lock_dlm1 thread %d", error);
- return error;
- }
- ls->thread1 = p;
-
- p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
+ p = kthread_run(gdlm_thread, ls, "lock_dlm");
error = IS_ERR(p);
if (error) {
- log_error("can't start lock_dlm2 thread %d", error);
- kthread_stop(ls->thread1);
+ log_error("can't start lock_dlm thread %d", error);
return error;
}
- ls->thread2 = p;
+ ls->thread = p;
return 0;
}
void gdlm_release_threads(struct gdlm_ls *ls)
{
- kthread_stop(ls->thread1);
- kthread_stop(ls->thread2);
+ kthread_stop(ls->thread);
}
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a..00000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
-
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d9..00000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-
-struct nolock_lockspace {
- unsigned int nl_lvb_size;
-};
-
-static const struct lm_lockops nolock_ops;
-
-static int nolock_mount(char *table_name, char *host_data,
- lm_callback_t cb, void *cb_data,
- unsigned int min_lvb_size, int flags,
- struct lm_lockstruct *lockstruct,
- struct kobject *fskobj)
-{
- char *c;
- unsigned int jid;
- struct nolock_lockspace *nl;
-
- c = strstr(host_data, "jid=");
- if (!c)
- jid = 0;
- else {
- c += 4;
- sscanf(c, "%u", &jid);
- }
-
- nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
- if (!nl)
- return -ENOMEM;
-
- nl->nl_lvb_size = min_lvb_size;
-
- lockstruct->ls_jid = jid;
- lockstruct->ls_first = 1;
- lockstruct->ls_lvb_size = min_lvb_size;
- lockstruct->ls_lockspace = nl;
- lockstruct->ls_ops = &nolock_ops;
- lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
- return 0;
-}
-
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-
-static void nolock_unmount(void *lockspace)
-{
- struct nolock_lockspace *nl = lockspace;
- kfree(nl);
-}
-
-static void nolock_withdraw(void *lockspace)
-{
-}
-
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
- void **lockp)
-{
- *lockp = lockspace;
- return 0;
-}
-
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-
-static void nolock_put_lock(void *lock)
-{
-}
-
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
- unsigned int req_state, unsigned int flags)
-{
- return req_state | LM_OUT_CACHEABLE;
-}
-
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
- return 0;
-}
-
-static void nolock_cancel(void *lock)
-{
-}
-
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
- struct nolock_lockspace *nl = lock;
- int error = 0;
-
- *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
- if (!*lvbp)
- error = -ENOMEM;
-
- return error;
-}
-
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
- kfree(lvb);
-}
-
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- posix_test_lock(file, fl);
-
- return 0;
-}
-
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
- unsigned int message)
-{
-}
-
-static const struct lm_lockops nolock_ops = {
- .lm_proto_name = "lock_nolock",
- .lm_mount = nolock_mount,
- .lm_others_may_mount = nolock_others_may_mount,
- .lm_unmount = nolock_unmount,
- .lm_withdraw = nolock_withdraw,
- .lm_get_lock = nolock_get_lock,
- .lm_put_lock = nolock_put_lock,
- .lm_lock = nolock_lock,
- .lm_unlock = nolock_unlock,
- .lm_cancel = nolock_cancel,
- .lm_hold_lvb = nolock_hold_lvb,
- .lm_unhold_lvb = nolock_unhold_lvb,
- .lm_plock_get = nolock_plock_get,
- .lm_plock = nolock_plock,
- .lm_punlock = nolock_punlock,
- .lm_recovery_done = nolock_recovery_done,
- .lm_owner = THIS_MODULE,
-};
-
-static int __init init_nolock(void)
-{
- int error;
-
- error = gfs2_register_lockproto(&nolock_ops);
- if (error) {
- printk(KERN_WARNING
- "lock_nolock: can't register protocol: %d\n", error);
- return error;
- }
-
- printk(KERN_INFO
- "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
- return 0;
-}
-
-static void __exit exit_nolock(void)
-{
- gfs2_unregister_lockproto(&nolock_ops);
-}
-
-module_init(init_nolock);
-module_exit(exit_nolock);
-
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836..6c6af9f5e3a 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
*/
static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 77115281650..7c64510ccfd 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
*/
static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
{
spin_lock(&sdp->sd_log_lock);
}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
*/
static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
{
spin_unlock(&sdp->sd_log_lock);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd5..bcc668d0fad 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
INIT_HLIST_NODE(&gl->gl_list);
spin_lock_init(&gl->gl_spin);
INIT_LIST_HEAD(&gl->gl_holders);
- INIT_LIST_HEAD(&gl->gl_waiters1);
- INIT_LIST_HEAD(&gl->gl_waiters3);
gl->gl_lvb = NULL;
atomic_set(&gl->gl_lvb_count, 0);
INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f8..09853620c95 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
}
/**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
* @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
* Returns: the buffer
*/
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gl->gl_aspace->i_mapping;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
{
struct buffer_head *bh;
- bh = getbuf(gl, blkno, CREATE);
+ bh = gfs2_getbuf(gl, blkno, CREATE);
meta_prep_new(bh);
return bh;
}
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- *bhp = getbuf(gl, blkno, CREATE);
+ *bhp = gfs2_getbuf(gl, blkno, CREATE);
if (!buffer_uptodate(*bhp)) {
ll_rw_block(READ_META, 1, bhp);
if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
while (blen) {
- bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+ bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (extlen > max_ra)
extlen = max_ra;
- first_bh = getbuf(gl, dblock, CREATE);
+ first_bh = gfs2_getbuf(gl, dblock, CREATE);
if (buffer_uptodate(first_bh))
goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
extlen--;
while (extlen) {
- bh = getbuf(gl, dblock, CREATE);
+ bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe..b1a5f3674d4 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int flags, struct buffer_head **bhp);
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb..e64a1b04117 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
* @file: The file to read
* @page: The page of the file
*
- * This deals with the locking required. We use a trylock in order to
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
- * in the event that we are unable to get the lock.
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_holder *gh;
+ struct address_space *mapping = page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_holder gh;
int error;
- gh = gfs2_glock_is_locked_by_me(ip->i_gl);
- if (!gh) {
- gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
- if (!gh)
- return -ENOBUFS;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+ unlock_page(page);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ error = gfs2_glock_nq_atime(&gh);
+ if (unlikely(error))
+ goto out;
+ error = AOP_TRUNCATED_PAGE;
+ lock_page(page);
+ if (page->mapping == mapping && !PageUptodate(page))
+ error = __gfs2_readpage(file, page);
+ else
unlock_page(page);
- error = gfs2_glock_nq_atime(gh);
- if (likely(error != 0))
- goto out;
- return AOP_TRUNCATED_PAGE;
- }
- error = __gfs2_readpage(file, page);
- gfs2_glock_dq(gh);
+ gfs2_glock_dq(&gh);
out:
- gfs2_holder_uninit(gh);
- kfree(gh);
+ gfs2_holder_uninit(&gh);
+ if (error && error != AOP_TRUNCATED_PAGE)
+ lock_page(page);
return error;
}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a06..e9a366d4411 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
} else
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
return error;
}
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
[7] = GFS2_DIF_NOATIME,
[12] = GFS2_DIF_EXHASH,
[14] = GFS2_DIF_INHERIT_JDATA,
- [20] = GFS2_DIF_INHERIT_DIRECTIO,
};
static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
[gfs2fl_AppendOnly] = FS_APPEND_FL,
[gfs2fl_NoAtime] = FS_NOATIME_FL,
[gfs2fl_ExHash] = FS_INDEX_FL,
- [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
};
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
return error;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
- if (!S_ISDIR(inode->i_mode)) {
- if (ip->i_di.di_flags & GFS2_DIF_JDATA)
- fsflags |= FS_JOURNAL_DATA_FL;
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- fsflags |= FS_DIRECTIO_FL;
- }
+ if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+ fsflags |= FS_JOURNAL_DATA_FL;
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
/* Flags that can be set by user space */
#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
- GFS2_DIF_DIRECTIO| \
GFS2_DIF_IMMUTABLE| \
GFS2_DIF_APPENDONLY| \
GFS2_DIF_NOATIME| \
GFS2_DIF_SYNC| \
GFS2_DIF_SYSTEM| \
- GFS2_DIF_INHERIT_DIRECTIO| \
GFS2_DIF_INHERIT_JDATA)
/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
int error;
u32 new_flags, flags;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ error = mnt_want_write(filp->f_path.mnt);
if (error)
return error;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ goto out_drop_write;
+
flags = ip->i_di.di_flags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
!capable(CAP_LINUX_IMMUTABLE))
goto out;
if (!IS_IMMUTABLE(inode)) {
- error = permission(inode, MAY_WRITE, NULL);
+ error = gfs2_permission(inode, MAY_WRITE);
if (error)
goto out;
}
@@ -272,6 +269,8 @@ out_trans_end:
gfs2_trans_end(sdp);
out:
gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+ mnt_drop_write(filp->f_path.mnt);
return error;
}
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
if (!S_ISDIR(inode->i_mode)) {
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
- gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
return do_gfs2_set_flags(filp, gfsflags, ~0);
}
return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail_gunlock;
}
- /* Listen to the Direct I/O flag */
-
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- file->f_flags |= O_DIRECT;
-
gfs2_glock_dq_uninit(&i_gh);
}
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
- | GL_FLOCK;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_glock_dq_wait(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
- error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
- ip->i_no_addr, &gfs2_flock_glops,
- CREATE, &gl);
+ error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+ &gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d..b4d1d649063 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_rindex_mutex);
INIT_LIST_HEAD(&sdp->sd_rindex_list);
INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+ return;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
goto out;
}
- if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
GFS2_MIN_LVB_SIZE)) {
gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
while (invalidate_inodes(sb))
yield();
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c002..1e252dfc529 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out;
- error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
}
}
} else {
- error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
/* Check out the dir to be renamed */
if (dir_rename) {
- error = permission(odentry->d_inode, MAY_WRITE, NULL);
+ error = gfs2_permission(odentry->d_inode, MAY_WRITE);
if (error)
goto out_gunlock;
}
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
* Returns: errno
*/
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
unlock = 1;
}
- error = generic_permission(inode, mask, gfs2_check_acl);
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ error = -EACCES;
+ else
+ error = generic_permission(inode, mask, gfs2_check_acl);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
return error;
}
+static int gfs2_iop_permission(struct inode *inode, int mask,
+ struct nameidata *nd)
+{
+ return gfs2_permission(inode, mask);
+}
+
static int setattr_size(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
}
const struct inode_operations gfs2_file_iops = {
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
.rmdir = gfs2_rmdir,
.mknod = gfs2_mknod,
.rename = gfs2_rename,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = gfs2_readlink,
.follow_link = gfs2_follow_link,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb8..f66ea0f7a35 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
gfs2_clear_rgrpd(sdp);
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
sb->s_dirt = 0;
- if (wait)
+ if (wait && sb->s_fs_info)
gfs2_log_flush(sb->s_fs_info, NULL);
return 0;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59..3e073f5144f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
do_sync = 0;
else {
value *= gfs2_jindex_size(sdp) * num;
- do_div(value, den);
+ value = div_s64(value, den);
value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c..d5e91f4f6a0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
unsigned int message)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+ return;
+
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_recovery_done(
sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
- GL_NOCANCEL | GL_NOCACHE, &t_gh);
+ GL_NOCACHE, &t_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742..2d90fb25350 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_rindex_spin);
sdp->sd_rindex_forward = NULL;
- head = &sdp->sd_rindex_recent_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
- list_del(&rgd->rd_recent);
- }
spin_unlock(&sdp->sd_rindex_spin);
head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
}
/**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
- u64 rglast)
-{
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- if (rglast) {
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgrp_contains_block(rgd, rglast))
- goto out;
- }
- }
- rgd = NULL;
- if (!list_empty(&sdp->sd_rindex_recent_list))
- rgd = list_entry(sdp->sd_rindex_recent_list.next,
- struct gfs2_rgrpd, rd_recent);
-out:
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
* recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
- * @remove:
*
* Returns: The next rgrp in the recent list
*/
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
- int remove)
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
{
struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
struct list_head *head;
struct gfs2_rgrpd *rgd;
spin_lock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_recent_list;
-
- list_for_each_entry(rgd, head, rd_recent) {
- if (rgd == cur_rgd) {
- if (cur_rgd->rd_recent.next != head)
- rgd = list_entry(cur_rgd->rd_recent.next,
- struct gfs2_rgrpd, rd_recent);
- else
- rgd = NULL;
-
- if (remove)
- list_del(&cur_rgd->rd_recent);
-
- goto out;
- }
+ head = &sdp->sd_rindex_mru_list;
+ if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+ spin_unlock(&sdp->sd_rindex_spin);
+ return NULL;
}
-
- rgd = NULL;
- if (!list_empty(head))
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-
-out:
+ rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
spin_unlock(&sdp->sd_rindex_spin);
return rgd;
}
/**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
- struct gfs2_sbd *sdp = new_rgd->rd_sbd;
- struct gfs2_rgrpd *rgd;
- unsigned int count = 0;
- unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-
- spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgd == new_rgd)
- goto out;
-
- if (++count >= max)
- goto out;
- }
- list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-
-out:
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* forward_rgrp_get - get an rgrp to try next from full list
* @sdp: The GFS2 superblock
*
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
int loops = 0;
int error, rg_locked;
- /* Try recently successful rgrps */
-
- rgd = recent_rgrp_first(sdp, ip->i_goal);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
- rgd = recent_rgrp_next(rgd, 1);
- break;
-
+ /* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd, 0);
+ rgd = recent_rgrp_next(rgd);
break;
default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
out:
if (begin) {
- recent_rgrp_add(rgd);
+ spin_lock(&sdp->sd_rindex_spin);
+ list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ spin_unlock(&sdp->sd_rindex_spin);
rgd = gfs2_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f3..63a8a902d9d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_quantum = 60;
gt->gt_atime_quantum = 3600;
gt->gt_new_files_jdata = 0;
- gt->gt_new_files_directio = 0;
gt->gt_max_readahead = 1 << 18;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
}
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
- LM_FLAG_PRIORITY | GL_NOCACHE,
- t_gh);
+ GL_NOCACHE, t_gh);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd..74846559fc3 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
return len;
}
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (simple_strtol(buf, NULL, 0) != 1)
- return -EINVAL;
-
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- return len;
-}
-
static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
GFS2_ATTR(id, 0444, id_show, NULL);
GFS2_ATTR(fsname, 0444, fsname_show, NULL);
GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
-GFS2_ATTR(shrink, 0200, NULL, shrink_store);
GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
&gfs2_attr_id.attr,
&gfs2_attr_fsname.attr,
&gfs2_attr_freeze.attr,
- &gfs2_attr_shrink.attr,
&gfs2_attr_withdraw.attr,
&gfs2_attr_statfs_sync.attr,
&gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(quota_cache_secs, 1);
TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_quotad_secs.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
- &tune_attr_new_files_directio.attr,
NULL,
};
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022c..91389c8aee8 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7cee..f8b3be87322 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages, because writepages can do
+ * block allocation with delalloc, and we need to
+ * write only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -748,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing
+ * commit block, which happens below in such setting.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -768,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -854,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -997,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a7..b26c6d9fe6a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e6780..4f7cadbb19f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold a
+ * reference to the buffers while inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free that buffer.
+ *
+ * Takes journal->j_state_lock itself, so the caller must not hold it.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(): the latter still
+ * holds a reference to the buffers while processing them, so
+ * try_to_free_buffers() fails to free those buffers. Some callers of
+ * releasepage() require the page buffers to be dropped and otherwise
+ * treat the failure to free as an error (e.g. generic_file_direct_IO()).
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e. make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flushing
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in jbd2_journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of the truncated part in
+ * case the inode is in the committing transaction so that we uphold the
+ * ordered mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86e..6a73de84bce 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/uaccess.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
@@ -30,29 +31,19 @@
static struct proc_dir_entry *base;
#ifdef CONFIG_JFS_DEBUG
-static int loglevel_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
{
- int len;
-
- len = sprintf(page, "%d\n", jfsloglevel);
-
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+ seq_printf(m, "%d\n", jfsloglevel);
+ return 0;
+}
- return len;
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_loglevel_proc_show, NULL);
}
-static int loglevel_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+static ssize_t jfs_loglevel_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
jfsloglevel = c - '0';
return count;
}
+
+static const struct file_operations jfs_loglevel_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_loglevel_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = jfs_loglevel_proc_write,
+};
#endif
static struct {
const char *name;
- read_proc_t *read_fn;
- write_proc_t *write_fn;
+ const struct file_operations *proc_fops;
} Entries[] = {
#ifdef CONFIG_JFS_STATISTICS
- { "lmstats", jfs_lmstats_read, },
- { "txstats", jfs_txstats_read, },
- { "xtstat", jfs_xtstat_read, },
- { "mpstat", jfs_mpstat_read, },
+ { "lmstats", &jfs_lmstats_proc_fops, },
+ { "txstats", &jfs_txstats_proc_fops, },
+ { "xtstat", &jfs_xtstat_proc_fops, },
+ { "mpstat", &jfs_mpstat_proc_fops, },
#endif
#ifdef CONFIG_JFS_DEBUG
- { "TxAnchor", jfs_txanchor_read, },
- { "loglevel", loglevel_read, loglevel_write }
+ { "TxAnchor", &jfs_txanchor_proc_fops, },
+ { "loglevel", &jfs_loglevel_proc_fops }
#endif
};
#define NPROCENT ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
return;
base->owner = THIS_MODULE;
- for (i = 0; i < NPROCENT; i++) {
- struct proc_dir_entry *p;
- if ((p = create_proc_entry(Entries[i].name, 0, base))) {
- p->read_proc = Entries[i].read_fn;
- p->write_proc = Entries[i].write_fn;
- }
- }
+ for (i = 0; i < NPROCENT; i++)
+ proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
}
void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc..eafd1300a00 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
extern int jfsloglevel;
-extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txanchor_proc_fops;
/* information message: e.g., configuration, major event */
#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
* ----------
*/
#ifdef CONFIG_JFS_STATISTICS
-extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_lmstats_proc_fops;
+extern const struct file_operations jfs_txstats_proc_fops;
+extern const struct file_operations jfs_mpstat_proc_fops;
+extern const struct file_operations jfs_xtstat_proc_fops;
#define INCREMENT(x) ((x)++)
#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafe..2545bb31723 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
#define JFS_REMOVE 3
#define JFS_RENAME 4
-#define DIRENTSIZ(namlen) \
- ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
-
/*
* Maximum file offset for directories.
*/
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916bea..d6363d8309d 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
jfs_error(ip->i_sb,
"diAlloc: can't find free bit "
"in wmap");
- return EIO;
+ return -EIO;
}
/* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95..cd2ec2988b5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/mutex.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Logmgr stats\n"
"================\n"
"commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
lmStat.pagedone,
lmStat.full_page,
lmStat.partial_page);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_lmstats_proc_show, NULL);
}
+
+const struct file_operations jfs_lmstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_lmstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fc..854ff0ec574 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/bio.h>
#include <linux/init.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Metapage statistics\n"
"=======================\n"
"page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
mpStat.pagealloc,
mpStat.pagefree,
mpStat.lockwait);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_mpstat_proc_show, NULL);
}
+
+const struct file_operations jfs_mpstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_mpstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b..f26e4d03ada 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
char *freewait;
char *freelockwait;
char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
lowlockwait =
waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxAnchor\n"
"============\n"
"freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
TxAnchor.tlocksInUse,
jfs_tlocks_low,
list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txanchor_proc_show, NULL);
}
+
+const struct file_operations jfs_txanchor_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txanchor_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxStats\n"
"===========\n"
"calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
TxStat.txBeginAnon_lockslow,
TxStat.txLockAlloc,
TxStat.txLockAlloc_freelock);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txstats_proc_show, NULL);
}
+
+const struct file_operations jfs_txstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbc..ae3acafb447 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
*/
#include <linux/fs.h>
+#include <linux/module.h>
#include <linux/quotaops.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Xtree statistics\n"
"====================\n"
"searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
xtStat.search,
xtStat.fastSearch,
xtStat.split);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_xtstat_proc_show, NULL);
}
+
+const struct file_operations jfs_xtstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_xtstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa..2aba8238681 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
free_UCSname(&key);
if (rc == -ENOENT) {
d_add(dentry, NULL);
- return ERR_PTR(0);
+ return NULL;
} else if (rc) {
jfs_err("jfs_lookup: dtSearch returned %d", rc);
return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea6545173..0288e6d7936 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -499,7 +499,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out_no_root;
+ goto out_no_rw;
}
sb->s_root = d_alloc_root(inode);
if (!sb->s_root)
@@ -521,9 +521,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
out_no_root:
- jfs_err("jfs_read_super: get root inode failed");
- if (inode)
- iput(inode);
+ jfs_err("jfs_read_super: get root dentry failed");
+ iput(inode);
out_no_rw:
rc = jfs_umount(sb);
diff --git a/fs/libfs.c b/fs/libfs.c
index 892d41cb338..baeb71ee1cd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count)
mntput(mnt);
}
+/**
+ * simple_read_from_buffer - copy data from the buffer to user space
+ * @to: the user space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The simple_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the user space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or a negative value is returned on error.
+ **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
@@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
return count;
}
+/**
+ * memory_read_from_buffer - copy data from the buffer
+ * @to: the kernel space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The memory_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the kernel space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or a negative value is returned on error.
+ **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3..1f6dc518505 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
static void nlmclnt_rpc_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
* Report the conflicting lock back to the application.
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
- fl->fl_end = req->a_res.lock.fl.fl_start;
+ fl->fl_end = req->a_res.lock.fl.fl_end;
fl->fl_type = req->a_res.lock.fl.fl_type;
fl->fl_pid = 0;
break;
@@ -710,7 +712,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
die:
return;
retry_rebind:
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
retry_unlock:
rpc_restart_call(task);
}
@@ -788,7 +792,9 @@ retry_cancel:
/* Don't ever retry more than 3 times */
if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
goto die;
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
rpc_restart_call(task);
rpc_delay(task, 30 * HZ);
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387..2e27176ff42 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -248,7 +248,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
static void nlm4svc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlm4svc_callback_ops = {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfd..56a08ab9a4c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -795,6 +795,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
dprintk("lockd: GRANT_MSG RPC callback\n");
+ lock_kernel();
/* if the block is not on a list at this point then it has
* been invalidated. Don't try to requeue it.
*
@@ -804,7 +805,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
* for nlm_blocked?
*/
if (list_empty(&block->b_list))
- return;
+ goto out;
/* Technically, we should down the file semaphore here. Since we
* move the block towards the head of the queue only, no harm
@@ -818,13 +819,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
}
nlmsvc_insert_block(block, timeout);
svc_wake_up(block->b_daemon);
+out:
+ unlock_kernel();
}
static void nlmsvc_grant_release(void *data)
{
struct nlm_rqst *call = data;
+ lock_kernel();
nlmsvc_release_block(call->a_block);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_grant_ops = {
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b..ce6952b50a7 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -278,7 +278,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
static void nlmsvc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_callback_ops = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a..dbcc7af76a1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d702..1f7f2956412 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &msdos_dentry_operations;
- lock_kernel();
+ lock_super(sb);
res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (res == -ENOENT)
goto add;
@@ -232,7 +232,7 @@ add:
if (dentry)
dentry->d_op = &msdos_dentry_operations;
out:
- unlock_kernel();
+ unlock_super(sb);
if (!res)
return dentry;
return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
err = fat_flush_inodes(sb, dir, inode);
return err;
@@ -324,11 +324,12 @@ out:
/***** Remove a directory */
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
+ struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
/*
* Check whether the directory is not in use, then check
* whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, is_hid, cluster;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
fat_flush_inodes(sb, dir, inode);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -419,10 +420,11 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb= inode->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (err)
goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -618,10 +620,11 @@ error_inode:
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(old_dentry->d_name.name,
old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
new_dir, new_msdos_name, new_dentry, is_hid);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+ err = fat_flush_inodes(sb, old_dir, new_dir);
return err;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e..4f6f7635b59 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
const char *str;
};
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+
+ return security_sb_show_options(m, sb);
}
static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- show_sb_opts(m, mnt->mnt_sb);
+ err = show_sb_opts(m, mnt->mnt_sb);
+ if (err)
+ goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
+out:
return err;
}
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
seq_putc(m, ' ');
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- show_sb_opts(m, sb);
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt);
seq_putc(m, '\n');
+out:
return err;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b3..6a7d901f193 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
return 0;
}
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations ncp_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = ncp_remote_llseek,
.read = ncp_file_read,
.write = ncp_file_write,
.ioctl = ncp_ioctl,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c830062..f447f4b4476 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
struct nfs_callback_data {
unsigned int users;
- struct svc_serv *serv;
+ struct svc_rqst *rqst;
struct task_struct *task;
};
@@ -91,21 +91,17 @@ nfs_callback_svc(void *vrqstp)
svc_process(rqstp);
}
unlock_kernel();
- nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
return 0;
}
/*
- * Bring up the server process if it is not already up.
+ * Bring up the callback thread if it is not already up.
*/
int nfs_callback_up(void)
{
struct svc_serv *serv = NULL;
- struct svc_rqst *rqstp;
int ret = 0;
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
@@ -121,22 +117,23 @@ int nfs_callback_up(void)
nfs_callback_tcpport = ret;
dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
- if (IS_ERR(rqstp)) {
- ret = PTR_ERR(rqstp);
+ nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ if (IS_ERR(nfs_callback_info.rqst)) {
+ ret = PTR_ERR(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
goto out_err;
}
svc_sock_update_bufs(serv);
- nfs_callback_info.serv = serv;
- nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp,
+ nfs_callback_info.task = kthread_run(nfs_callback_svc,
+ nfs_callback_info.rqst,
"nfsv4-svc");
if (IS_ERR(nfs_callback_info.task)) {
ret = PTR_ERR(nfs_callback_info.task);
- nfs_callback_info.serv = NULL;
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
goto out_err;
}
out:
@@ -149,7 +146,6 @@ out:
if (serv)
svc_destroy(serv);
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
return ret;
out_err:
dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +155,19 @@ out_err:
}
/*
- * Kill the server process if it is not already down.
+ * Kill the callback thread if it's no longer being used.
*/
void nfs_callback_down(void)
{
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
nfs_callback_info.users--;
- if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL)
+ if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
kthread_stop(nfs_callback_info.task);
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
+ nfs_callback_info.task = NULL;
+ }
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
}
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b..5ee23e7058b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
{
to->to_initval = timeo * HZ / 10;
to->to_retries = retrans;
- if (!to->to_retries)
- to->to_retries = 2;
switch (proto) {
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_RDMA:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_TCP_RETRANS;
if (to->to_initval == 0)
- to->to_initval = 60 * HZ;
+ to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_exponential = 0;
break;
case XPRT_TRANSPORT_UDP:
- default:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_UDP_RETRANS;
if (!to->to_initval)
- to->to_initval = 11 * HZ / 10;
+ to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
to->to_initval = NFS_MAX_UDP_TIMEOUT;
to->to_maxval = NFS_MAX_UDP_TIMEOUT;
to->to_exponential = 1;
break;
+ default:
+ BUG();
}
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58d43daec08..28a238dab23 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
{
int res;
- dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
- inode->i_sb->s_id, inode->i_ino);
+ dfprintk(FILE, "NFS: open dir(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
/* Call generic open code in order to cache credentials */
res = nfs_open(inode, filp);
- unlock_kernel();
return res;
}
@@ -204,7 +205,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
/* Should never happen */
nfs_zap_mapping(inode, inode->i_mapping);
}
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct nfs_fattr fattr;
long res;
- dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(long long)filp->f_pos);
nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
- lock_kernel();
-
/*
* filp->f_pos points to the dirent entry number.
* *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
out:
nfs_unblock_sillyrename(dentry);
- unlock_kernel();
if (res > 0)
res = 0;
- dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
res);
return res;
@@ -603,7 +601,15 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
{
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name,
+ offset, origin);
+
+ mutex_lock(&inode->i_mutex);
switch (origin) {
case 1:
offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
nfs_file_open_context(filp)->dir_cookie = 0;
}
out:
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
return offset;
}
@@ -629,10 +635,11 @@ out:
*/
static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
{
- dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n",
+ dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
datasync);
+ nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
return 0;
}
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
struct nfs_fattr fattr;
parent = dget_parent(dentry);
- lock_kernel();
dir = parent->d_inode;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
__func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
shrink_dcache_parent(dentry);
}
d_drop(dentry);
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
}
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Called when the dentry loses inode.
* We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
- lock_kernel();
drop_nlink(inode);
nfs_complete_unlink(dentry, inode);
- unlock_kernel();
}
iput(inode);
}
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
res = ERR_PTR(-ENOMEM);
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
- lock_kernel();
-
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (nfs_is_exclusive_create(dir, nd)) {
d_instantiate(dentry, NULL);
res = NULL;
- goto out_unlock;
+ goto out;
}
parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
-out_unlock:
- unlock_kernel();
out:
return res;
}
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
}
/* Open the file on the server */
- lock_kernel();
res = nfs4_atomic_open(dir, dentry, nd);
- unlock_kernel();
if (IS_ERR(res)) {
error = PTR_ERR(res);
switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
* operations that change the directory. We therefore save the
* change attribute *before* we do the RPC call.
*/
- lock_kernel();
ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
- unlock_kernel();
out:
dput(parent);
if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
if ((nd->flags & LOOKUP_CREATE) != 0)
open_flags = nd->intent.open.flags;
- lock_kernel();
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return error;
}
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
if (status != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return status;
}
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
attr.ia_valid = ATTR_MODE;
attr.ia_mode = mode | S_IFDIR;
- lock_kernel();
error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
d_drop(dentry);
- unlock_kernel();
return error;
}
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
- lock_kernel();
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
if (error == 0 && dentry->d_inode != NULL)
clear_nlink(dentry->d_inode);
else if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
- unlock_kernel();
return error;
}
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
/* The VFS may want to delete this inode */
if (error == 0)
- drop_nlink(inode);
+ nfs_drop_nlink(inode);
nfs_mark_for_revalidate(inode);
} else
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
dir->i_ino, dentry->d_name.name);
- lock_kernel();
spin_lock(&dcache_lock);
spin_lock(&dentry->d_lock);
if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
error = nfs_sillyrename(dir, dentry);
- unlock_kernel();
return error;
}
if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
} else if (need_rehash)
d_rehash(dentry);
- unlock_kernel();
return error;
}
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
-
page = alloc_page(GFP_HIGHUSER);
- if (!page) {
- unlock_kernel();
+ if (!page)
return -ENOMEM;
- }
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
dentry->d_name.name, symname, error);
d_drop(dentry);
__free_page(page);
- unlock_kernel();
return error;
}
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
} else
__free_page(page);
- unlock_kernel();
return 0;
}
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
- lock_kernel();
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
atomic_inc(&inode->i_count);
d_add(dentry, inode);
}
- unlock_kernel();
return error;
}
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* To prevent any new references to the target during the rename,
* we unhash the dentry and free the inode in advance.
*/
- lock_kernel();
if (!d_unhashed(new_dentry)) {
d_drop(new_dentry);
rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* dentry still busy? */
goto out;
} else
- drop_nlink(new_inode);
+ nfs_drop_nlink(new_inode);
go_ahead:
/*
@@ -1669,7 +1648,6 @@ out:
/* new dentry created? */
if (dentry)
dput(dentry);
- unlock_kernel();
return error;
}
@@ -1962,8 +1940,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
}
force_lookup:
- lock_kernel();
-
if (!NFS_PROTO(inode)->access)
goto out_notsup;
@@ -1973,7 +1949,6 @@ force_lookup:
put_rpccred(cred);
} else
res = PTR_ERR(cred);
- unlock_kernel();
out:
dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1957,6 @@ out_notsup:
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask, NULL);
- unlock_kernel();
goto out;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a..08f6b040d28 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
- dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
- dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32a..78460657f5c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_fsync(struct file *, struct dentry *dentry, int datasync);
+static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
static int nfs_check_flags(int flags);
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
- .fsync = nfs_fsync,
+ .fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
{
int res;
+ dprintk("NFS: open file(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
res = nfs_check_flags(filp->f_flags);
if (res)
return res;
nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
- res = NFS_PROTO(inode)->file_open(inode, filp);
- unlock_kernel();
+ res = nfs_open(inode, filp);
return res;
}
static int
nfs_file_release(struct inode *inode, struct file *filp)
{
+ struct dentry *dentry = filp->f_path.dentry;
+
+ dprintk("NFS: release(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+
/* Ensure that dirty pages are flushed out with the right creds */
if (filp->f_mode & FMODE_WRITE)
- nfs_wb_all(filp->f_path.dentry->d_inode);
+ nfs_wb_all(dentry->d_inode);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return NFS_PROTO(inode)->file_release(inode, filp);
+ return nfs_release(inode, filp);
}
/**
@@ -170,6 +178,13 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
+ loff_t loff;
+
+ dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
+ offset, origin);
+
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
@@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(filp, offset, origin);
+ lock_kernel(); /* BKL needed? */
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ unlock_kernel();
+ return loff;
}
/*
- * Helper for nfs_file_flush() and nfs_fsync()
+ * Helper for nfs_file_flush() and nfs_file_fsync()
*
* Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
* disk, but it retrieves and clears ctx->error after synching, despite
@@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
/*
* Flush all dirty pages, and check for write errors.
- *
*/
static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: flush(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
@@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_read(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+ dprintk("NFS: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long) pos);
@@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
struct inode *inode = dentry->d_inode;
ssize_t res;
- dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
+ dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long long) *ppos);
@@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: mmap(%s/%s)\n",
+ dprintk("NFS: mmap(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
* whether any write errors occurred for this process.
*/
static int
-nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = dentry->d_inode;
- dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: fsync file(%s/%s) datasync %d\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
return nfs_do_fsync(ctx, inode);
@@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
index = pos >> PAGE_CACHE_SHIFT;
+ dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
page = __grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
int status;
- lock_kernel();
+ dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ */
+ if (!PageUptodate(page)) {
+ unsigned pglen = nfs_page_length(page);
+ unsigned end = offset + len;
+
+ if (pglen == 0) {
+ zero_user_segments(page, 0, offset,
+ end, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else if (end >= pglen) {
+ zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ if (offset == 0)
+ SetPageUptodate(page);
+ } else
+ zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ }
+
status = nfs_updatepage(file, page, offset, copied);
- unlock_kernel();
unlock_page(page);
page_cache_release(page);
@@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+
if (offset != 0)
return;
/* Cancel any unstarted writes on this page */
@@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
static int nfs_release_page(struct page *page, gfp_t gfp)
{
+ dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+
/* If PagePrivate() is set, then the page is not freeable */
return 0;
}
static int nfs_launder_page(struct page *page)
{
- return nfs_wb_page(page->mapping->host, page);
+ struct inode *inode = page->mapping->host;
+
+ dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+ inode->i_ino, (long long)page_offset(page));
+
+ return nfs_wb_page(inode, page);
}
const struct address_space_operations nfs_file_aops = {
@@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
struct file *filp = vma->vm_file;
+ struct dentry *dentry = filp->f_path.dentry;
unsigned pagelen;
int ret = -EINVAL;
struct address_space *mapping;
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ filp->f_mapping->host->i_ino,
+ (long long)page_offset(page));
+
lock_page(page);
mapping = page->mapping;
- if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+ if (mapping != dentry->d_inode->i_mapping)
goto out_unlock;
ret = 0;
@@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_write(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
+ dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- inode->i_ino, (unsigned long) count, (long long) pos);
+ (unsigned long) count, (long long) pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- nfs_zap_caches(inode);
+ if (!nfs_have_delegation(inode, FMODE_READ))
+ nfs_zap_caches(inode);
out:
return status;
}
@@ -592,23 +658,35 @@ out:
*/
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode * inode = filp->f_mapping->host;
+ struct inode *inode = filp->f_mapping->host;
+ int ret = -ENOLCK;
- dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
- inode->i_sb->s_id, inode->i_ino,
+ dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags,
(long long)fl->fl_start, (long long)fl->fl_end);
+
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
/* No mandatory locks over NFS */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
+ goto out_err;
+
+ if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+ ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+ if (ret < 0)
+ goto out_err;
+ }
if (IS_GETLK(cmd))
- return do_getlk(filp, cmd, fl);
- if (fl->fl_type == F_UNLCK)
- return do_unlk(filp, cmd, fl);
- return do_setlk(filp, cmd, fl);
+ ret = do_getlk(filp, cmd, fl);
+ else if (fl->fl_type == F_UNLCK)
+ ret = do_unlk(filp, cmd, fl);
+ else
+ ret = do_setlk(filp, cmd, fl);
+out_err:
+ return ret;
}
/*
@@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
*/
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
- dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
- filp->f_path.dentry->d_inode->i_sb->s_id,
- filp->f_path.dentry->d_inode->i_ino,
+ dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags);
/*
@@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
return do_setlk(filp, cmd, fl);
}
+/*
+ * There is no protocol support for leases, so we have no way to implement
+ * them correctly in the face of opens by other clients.
+ */
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
{
- /*
- * There is no protocol support for leases, so we have no way
- * to implement them correctly in the face of opens by other
- * clients.
- */
+ dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name, arg);
+
return -EINVAL;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f..df23f987da6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
-static void nfs_zap_acl_cache(struct inode *);
-
static struct kmem_cache * nfs_inode_cachep;
static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
}
}
-static void nfs_zap_acl_cache(struct inode *inode)
+void nfs_zap_acl_cache(struct inode *inode)
{
void (*clear_acl_cache)(struct inode *);
@@ -347,7 +345,7 @@ out_no_inode:
goto out;
}
-#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET)
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
/* Optimization: if the end result is no change, don't RPC */
attr->ia_valid &= NFS_VALID_ATTRS;
- if (attr->ia_valid == 0)
+ if ((attr->ia_valid & ~ATTR_FILE) == 0)
return 0;
- lock_kernel();
/* Write all dirty data */
if (S_ISREG(inode->i_mode)) {
filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
if (error == 0)
nfs_refresh_inode(inode, &fattr);
- unlock_kernel();
return error;
}
/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+ if (i_size_read(inode) < offset) {
+ unsigned long limit;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig;
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big;
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+ } else {
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, offset);
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ }
+ return 0;
+out_sig:
+ send_sig(SIGXFSZ, current, 0);
+out_big:
+ return -EFBIG;
+}
+
+/**
* nfs_setattr_update_inode - Update inode metadata after a setattr call.
* @inode: pointer to struct inode
* @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
- inode->i_size = attr->ia_size;
- vmtruncate(inode, attr->ia_size);
+ nfs_vmtruncate(inode, attr->ia_size);
}
}
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- lock_kernel();
if (is_bad_inode(inode))
goto out_nowait;
if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
nfs_wake_up_inode(inode);
out_nowait:
- unlock_kernel();
return status;
}
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
}
- if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+ if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
nfsi->npages == 0)
- inode->i_size = nfs_size_to_loff_t(fattr->size);
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
}
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
(fattr->valid & NFS_ATTR_WCC) == 0) {
memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
- fattr->pre_size = inode->i_size;
+ fattr->pre_size = i_size_read(inode);
fattr->valid |= NFS_ATTR_WCC;
}
return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
if (nfsi->npages == 0 || new_isize > cur_isize) {
- inode->i_size = new_isize;
+ i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddb..24241fcbb98 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
#ifdef CONFIG_NFS_V4
extern void nfs4_clear_inode(struct inode *);
#endif
+void nfs_zap_acl_cache(struct inode *inode);
/* super.c */
extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde58..a3695281003 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
*
* Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
*
- * NFS client per-mount statistics provide information about the health of
- * the NFS client and the health of each NFS mount point. Generally these
- * are not for detailed problem diagnosis, but simply to indicate that there
- * is a problem.
- *
- * These counters are not meant to be human-readable, but are meant to be
- * integrated into system monitoring tools such as "sar" and "iostat". As
- * such, the counters are sampled by the tools over time, and are never
- * zeroed after a file system is mounted. Moving averages can be computed
- * by the tools by taking the difference between two instantaneous samples
- * and dividing that by the time between the samples.
*/
#ifndef _NFS_IOSTAT
#define _NFS_IOSTAT
-#define NFS_IOSTAT_VERS "1.0"
-
-/*
- * NFS byte counters
- *
- * 1. SERVER - the number of payload bytes read from or written to the
- * server by the NFS client via an NFS READ or WRITE request.
- *
- * 2. NORMAL - the number of bytes read or written by applications via
- * the read(2) and write(2) system call interfaces.
- *
- * 3. DIRECT - the number of bytes read or written from files opened
- * with the O_DIRECT flag.
- *
- * These counters give a view of the data throughput into and out of the NFS
- * client. Comparing the number of bytes requested by an application with the
- * number of bytes the client requests from the server can provide an
- * indication of client efficiency (per-op, cache hits, etc).
- *
- * These counters can also help characterize which access methods are in
- * use. DIRECT by itself shows whether there is any O_DIRECT traffic.
- * NORMAL + DIRECT shows how much data is going through the system call
- * interface. A large amount of SERVER traffic without much NORMAL or
- * DIRECT traffic shows that applications are using mapped files.
- *
- * NFS page counters
- *
- * These count the number of pages read or written via nfs_readpage(),
- * nfs_readpages(), or their write equivalents.
- */
-enum nfs_stat_bytecounters {
- NFSIOS_NORMALREADBYTES = 0,
- NFSIOS_NORMALWRITTENBYTES,
- NFSIOS_DIRECTREADBYTES,
- NFSIOS_DIRECTWRITTENBYTES,
- NFSIOS_SERVERREADBYTES,
- NFSIOS_SERVERWRITTENBYTES,
- NFSIOS_READPAGES,
- NFSIOS_WRITEPAGES,
- __NFSIOS_BYTESMAX,
-};
-
-/*
- * NFS event counters
- *
- * These counters provide a low-overhead way of monitoring client activity
- * without enabling NFS trace debugging. The counters show the rate at
- * which VFS requests are made, and how often the client invalidates its
- * data and attribute caches. This allows system administrators to monitor
- * such things as how close-to-open is working, and answer questions such
- * as "why are there so many GETATTR requests on the wire?"
- *
- * They also count anamolous events such as short reads and writes, silly
- * renames due to close-after-delete, and operations that change the size
- * of a file (such operations can often be the source of data corruption
- * if applications aren't using file locking properly).
- */
-enum nfs_stat_eventcounters {
- NFSIOS_INODEREVALIDATE = 0,
- NFSIOS_DENTRYREVALIDATE,
- NFSIOS_DATAINVALIDATE,
- NFSIOS_ATTRINVALIDATE,
- NFSIOS_VFSOPEN,
- NFSIOS_VFSLOOKUP,
- NFSIOS_VFSACCESS,
- NFSIOS_VFSUPDATEPAGE,
- NFSIOS_VFSREADPAGE,
- NFSIOS_VFSREADPAGES,
- NFSIOS_VFSWRITEPAGE,
- NFSIOS_VFSWRITEPAGES,
- NFSIOS_VFSGETDENTS,
- NFSIOS_VFSSETATTR,
- NFSIOS_VFSFLUSH,
- NFSIOS_VFSFSYNC,
- NFSIOS_VFSLOCK,
- NFSIOS_VFSRELEASE,
- NFSIOS_CONGESTIONWAIT,
- NFSIOS_SETATTRTRUNC,
- NFSIOS_EXTENDWRITE,
- NFSIOS_SILLYRENAME,
- NFSIOS_SHORTREAD,
- NFSIOS_SHORTWRITE,
- NFSIOS_DELAY,
- __NFSIOS_COUNTSMAX,
-};
-
-#ifdef __KERNEL__
-
#include <linux/percpu.h>
#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
struct nfs_iostats {
unsigned long long bytes[__NFSIOS_BYTESMAX];
unsigned long events[__NFSIOS_COUNTSMAX];
} ____cacheline_aligned;
-static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
{
struct nfs_iostats *iostats;
int cpu;
cpu = get_cpu();
iostats = per_cpu_ptr(server->io_stats, cpu);
- iostats->events[stat] ++;
+ iostats->events[stat]++;
put_cpu_no_resched();
}
-static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
{
nfs_inc_server_stats(NFS_SERVER(inode), stat);
}
-static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
struct nfs_iostats *iostats;
int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
put_cpu_no_resched();
}
-static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
free_percpu(stats);
}
-#endif
-#endif
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0..423842f51ac 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
#include <linux/posix_acl_xattr.h>
#include <linux/nfsacl.h>
+#include "internal.h"
+
#define NFSDBG_FACILITY NFSDBG_PROC
ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
status = nfs_revalidate_inode(server, inode);
if (status < 0)
return ERR_PTR(status);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
acl = nfs3_get_cached_acl(inode, type);
if (acl != ERR_PTR(-EAGAIN))
return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS call setacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
status = rpc_call_sync(server->client_acl, &msg, 0);
- spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
- spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
dprintk("NFS reply setacl: %d\n", status);
/* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed..1e750e4574a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
return status;
}
+struct nfs3_createdata {
+ struct rpc_message msg;
+ union {
+ struct nfs3_createargs create;
+ struct nfs3_mkdirargs mkdir;
+ struct nfs3_symlinkargs symlink;
+ struct nfs3_mknodargs mknod;
+ } arg;
+ struct nfs3_diropres res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_attr;
+};
+
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+ struct nfs3_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_attr = &data->dir_attr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_attr);
+ }
+ return data;
+}
+
+static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+ int status;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ nfs_post_op_update_inode(dir, data->res.dir_attr);
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ return status;
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+ kfree(data);
+}
+
/*
* Create a regular file.
* For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags, struct nameidata *nd)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_fattr dir_attr;
- struct nfs3_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call create %s\n", dentry->d_name.name);
- arg.createmode = NFS3_CREATE_UNCHECKED;
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+ data->arg.create.fh = NFS_FH(dir);
+ data->arg.create.name = dentry->d_name.name;
+ data->arg.create.len = dentry->d_name.len;
+ data->arg.create.sattr = sattr;
+
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
if (flags & O_EXCL) {
- arg.createmode = NFS3_CREATE_EXCLUSIVE;
- arg.verifier[0] = jiffies;
- arg.verifier[1] = current->pid;
+ data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
+ data->arg.create.verifier[0] = jiffies;
+ data->arg.create.verifier[1] = current->pid;
}
sattr->ia_mode &= ~current->fs->umask;
-again:
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ for (;;) {
+ status = nfs3_do_create(dir, dentry, data);
- /* If the server doesn't support the exclusive creation semantics,
- * try again with simple 'guarded' mode. */
- if (status == -ENOTSUPP) {
- switch (arg.createmode) {
+ if (status != -ENOTSUPP)
+ break;
+ /* If the server doesn't support the exclusive creation
+ * semantics, try again with simple 'guarded' mode. */
+ switch (data->arg.create.createmode) {
case NFS3_CREATE_EXCLUSIVE:
- arg.createmode = NFS3_CREATE_GUARDED;
+ data->arg.create.createmode = NFS3_CREATE_GUARDED;
break;
case NFS3_CREATE_GUARDED:
- arg.createmode = NFS3_CREATE_UNCHECKED;
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
break;
case NFS3_CREATE_UNCHECKED:
goto out;
}
- goto again;
+ nfs_fattr_init(data->res.dir_attr);
+ nfs_fattr_init(data->res.fattr);
}
- if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
if (status != 0)
goto out;
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
- if (arg.createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
dprintk("NFS call setattr (post-create)\n");
if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
/* Note: we could use a guarded setattr here, but I'm
* not sure this buys us anything (and I'd have
* to revamp the NFSv3 XDR code) */
- status = nfs3_proc_setattr(dentry, &fattr, sattr);
- nfs_post_op_update_inode(dentry->d_inode, &fattr);
+ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+ nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
+ if (status != 0)
+ goto out;
}
- if (status != 0)
- goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply create: %d\n", status);
return status;
}
@@ -452,40 +492,28 @@ static int
nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_symlinkargs arg = {
- .fromfh = NFS_FH(dir),
- .fromname = dentry->d_name.name,
- .fromlen = dentry->d_name.len,
- .pages = &page,
- .pathlen = len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs3_createdata *data;
+ int status = -ENOMEM;
if (len > NFS3_MAXPATHLEN)
return -ENAMETOOLONG;
dprintk("NFS call symlink %s\n", dentry->d_name.name);
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+ data->arg.symlink.fromfh = NFS_FH(dir);
+ data->arg.symlink.fromname = dentry->d_name.name;
+ data->arg.symlink.fromlen = dentry->d_name.len;
+ data->arg.symlink.pages = &page;
+ data->arg.symlink.pathlen = len;
+ data->arg.symlink.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data);
+
+ nfs3_free_createdata(data);
out:
dprintk("NFS reply symlink: %d\n", status);
return status;
@@ -494,42 +522,31 @@ out:
static int
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mkdirargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
int mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+ data->arg.mkdir.fh = NFS_FH(dir);
+ data->arg.mkdir.name = dentry->d_name.name;
+ data->arg.mkdir.len = dentry->d_name.len;
+ data->arg.mkdir.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data);
if (status != 0)
goto out;
+
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply mkdir: %d\n", status);
return status;
}
@@ -615,52 +632,50 @@ static int
nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dev_t rdev)
{
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mknodargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- .rdev = rdev
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fh,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
-
- switch (sattr->ia_mode & S_IFMT) {
- case S_IFBLK: arg.type = NF3BLK; break;
- case S_IFCHR: arg.type = NF3CHR; break;
- case S_IFIFO: arg.type = NF3FIFO; break;
- case S_IFSOCK: arg.type = NF3SOCK; break;
- default: return -EINVAL;
- }
+ int status = -ENOMEM;
dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
MAJOR(rdev), MINOR(rdev));
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fh, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+ data->arg.mknod.fh = NFS_FH(dir);
+ data->arg.mknod.name = dentry->d_name.name;
+ data->arg.mknod.len = dentry->d_name.len;
+ data->arg.mknod.sattr = sattr;
+ data->arg.mknod.rdev = rdev;
+
+ switch (sattr->ia_mode & S_IFMT) {
+ case S_IFBLK:
+ data->arg.mknod.type = NF3BLK;
+ break;
+ case S_IFCHR:
+ data->arg.mknod.type = NF3CHR;
+ break;
+ case S_IFIFO:
+ data->arg.mknod.type = NF3FIFO;
+ break;
+ case S_IFSOCK:
+ data->arg.mknod.type = NF3SOCK;
+ break;
+ default:
+ status = -EINVAL;
+ goto out;
+ }
+
+ status = nfs3_do_create(dir, dentry, data);
if (status != 0)
goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply mknod: %d\n", status);
return status;
}
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_done = nfs3_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82..c910413eaec 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
/* Save the delegation */
memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
rcu_read_unlock();
- lock_kernel();
ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
- unlock_kernel();
if (ret != 0)
goto out;
ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
.server = server,
};
struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
};
unsigned long timestamp = jiffies;
int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
/* Use that stateid */
} else if (state != NULL) {
- msg.rpc_cred = state->owner->so_cred;
nfs4_copy_stateid(&arg.stateid, state, current->files);
} else
memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
return status;
}
-static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_do_setattr(inode, fattr, sattr, state),
+ _nfs4_do_setattr(inode, cred, fattr, sattr, state),
&exception);
} while (exception.retry);
return err;
@@ -1647,29 +1647,25 @@ static int
nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct rpc_cred *cred;
struct inode *inode = dentry->d_inode;
- struct nfs_open_context *ctx;
+ struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
int status;
nfs_fattr_init(fattr);
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
-
/* Search for an existing open(O_WRITE) file */
- ctx = nfs_find_open_context(inode, cred, FMODE_WRITE);
- if (ctx != NULL)
+ if (sattr->ia_valid & ATTR_FILE) {
+ struct nfs_open_context *ctx;
+
+ ctx = nfs_file_open_context(sattr->ia_file);
+ cred = ctx->cred;
state = ctx->state;
+ }
- status = nfs4_do_setattr(inode, fattr, sattr, state);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
- if (ctx != NULL)
- put_nfs_open_context(ctx);
- put_rpccred(cred);
return status;
}
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
}
state = nfs4_do_open(dir, &path, flags, sattr, cred);
- put_rpccred(cred);
d_drop(dentry);
if (IS_ERR(state)) {
status = PTR_ERR(state);
- goto out;
+ goto out_putcred;
}
d_add(dentry, igrab(state->inode));
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (flags & O_EXCL) {
struct nfs_fattr fattr;
- status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
+ status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(state->inode, sattr);
nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = nfs4_intent_set_file(nd, &path, state);
else
nfs4_close_sync(&path, state, flags);
+out_putcred:
+ put_rpccred(cred);
out:
return status;
}
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
return err;
}
+struct nfs4_createdata {
+ struct rpc_message msg;
+ struct nfs4_create_arg arg;
+ struct nfs4_create_res res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_fattr;
+};
+
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+ struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+ struct nfs4_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->arg.dir_fh = NFS_FH(dir);
+ data->arg.server = server;
+ data->arg.name = name;
+ data->arg.attrs = sattr;
+ data->arg.ftype = ftype;
+ data->arg.bitmask = server->attr_bitmask;
+ data->res.server = server;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_fattr = &data->dir_fattr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_fattr);
+ }
+ return data;
+}
+
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+ int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ if (status == 0) {
+ update_changeattr(dir, &data->res.dir_cinfo);
+ nfs_post_op_update_inode(dir, data->res.dir_fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ }
+ return status;
+}
+
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+ kfree(data);
+}
+
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4LNK,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENAMETOOLONG;
if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
+ goto out;
- arg.u.symlink.pages = &page;
- arg.u.symlink.len = len;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ status = -ENOMEM;
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+ data->arg.u.symlink.pages = &page;
+ data->arg.u.symlink.len = len;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4DIR,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENOMEM;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
-
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+ if (data == NULL)
+ goto out;
+
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fh,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
- int mode = sattr->ia_mode;
-
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ struct nfs4_createdata *data;
+ int mode = sattr->ia_mode;
+ int status = -ENOMEM;
BUG_ON(!(sattr->ia_valid & ATTR_MODE));
BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
+
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+ if (data == NULL)
+ goto out;
+
if (S_ISFIFO(mode))
- arg.ftype = NF4FIFO;
+ data->arg.ftype = NF4FIFO;
else if (S_ISBLK(mode)) {
- arg.ftype = NF4BLK;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4BLK;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
else if (S_ISCHR(mode)) {
- arg.ftype = NF4CHR;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4CHR;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
- else
- arg.ftype = NF4SOCK;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (status == 0) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fh, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
ret = nfs_revalidate_inode(server, inode);
if (ret < 0)
return ret;
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
nfs_inode_return_delegation(inode);
buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_zap_caches(inode);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
return ret;
}
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
task->tk_status = 0;
return -EAGAIN;
case -NFS4ERR_DELAY:
- nfs_inc_server_stats((struct nfs_server *) server,
- NFSIOS_DELAY);
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
{
- long timeout;
+ long timeout = 0;
int err;
do {
err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_done = nfs4_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f61..401ef8b28f9 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
allow_signal(SIGKILL);
/* Ensure exclusive access to NFSv4 state */
- lock_kernel();
down_write(&clp->cl_sem);
/* Are there any NFS mounts out there? */
if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
nfs_delegation_reap_unclaimed(clp);
out:
up_write(&clp->cl_sem);
- unlock_kernel();
if (status == -NFS4ERR_CB_PATH_DOWN)
nfs_handle_cb_pathdown(clp);
nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d3682..46763d1cd39 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
/*
- * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
- *
* Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
*
* Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
- nfs_data.acregmin = 3;
- nfs_data.acregmax = 60;
- nfs_data.acdirmin = 30;
- nfs_data.acdirmax = 60;
+ nfs_data.acregmin = NFS_DEF_ACREGMIN;
+ nfs_data.acregmax = NFS_DEF_ACREGMAX;
+ nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
+ nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
strcpy(buf, NFS_ROOT);
/* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81c..4dbb84df1b6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
sattr->ia_mode &= S_IALLUGO;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
}
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+ __s32 start, end;
+
+ start = (__s32)fl->fl_start;
+ if ((loff_t)start != fl->fl_start)
+ goto out_einval;
+
+ if (fl->fl_end != OFFSET_MAX) {
+ end = (__s32)fl->fl_end;
+ if ((loff_t)end != fl->fl_end)
+ goto out_einval;
+ } else
+ end = NFS_LOCK32_OFFSET_MAX;
+
+ if (start < 0 || start > end)
+ goto out_einval;
+ return 0;
+out_einval:
+ return -EINVAL;
+}
const struct nfs_rpc_ops nfs_v2_clientops = {
.version = 2, /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.write_setup = nfs_proc_write_setup,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs_proc_lock,
+ .lock_check_bounds = nfs_lock_check_bounds,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed543..1b94e3650f5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
#include <linux/inet.h>
#include <linux/in6.h>
#include <net/ipv6.h>
+#include <linux/netdevice.h>
#include <linux/nfs_xdr.h>
#include <linux/magic.h>
#include <linux/parser.h>
@@ -65,7 +66,6 @@
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
- Opt_intr, Opt_nointr,
Opt_posix, Opt_noposix,
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
@@ -92,8 +92,8 @@ enum {
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
- /* Mount options that are ignored */
- Opt_userspace, Opt_deprecated,
+ /* Special mount options */
+ Opt_userspace, Opt_deprecated, Opt_sloppy,
Opt_err
};
@@ -101,10 +101,14 @@ enum {
static match_table_t nfs_mount_option_tokens = {
{ Opt_userspace, "bg" },
{ Opt_userspace, "fg" },
+ { Opt_userspace, "retry=%s" },
+
+ { Opt_sloppy, "sloppy" },
+
{ Opt_soft, "soft" },
{ Opt_hard, "hard" },
- { Opt_intr, "intr" },
- { Opt_nointr, "nointr" },
+ { Opt_deprecated, "intr" },
+ { Opt_deprecated, "nointr" },
{ Opt_posix, "posix" },
{ Opt_noposix, "noposix" },
{ Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
{ Opt_acdirmin, "acdirmin=%u" },
{ Opt_acdirmax, "acdirmax=%u" },
{ Opt_actimeo, "actimeo=%u" },
- { Opt_userspace, "retry=%u" },
{ Opt_namelen, "namlen=%u" },
{ Opt_mountport, "mountport=%u" },
{ Opt_mountvers, "mountvers=%u" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static void nfs_kill_super(struct super_block *);
static void nfs_put_super(struct super_block *);
+static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
static struct file_system_type nfs_fs_type = {
.owner = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#endif
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
};
int error;
- lock_kernel();
-
error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
if (error < 0)
goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = server->namelen;
- unlock_kernel();
return 0;
out_err:
dprintk("%s: statfs error = %d\n", __func__, -error);
- unlock_kernel();
return error;
}
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (nfss->bsize != 0)
seq_printf(m, ",bsize=%u", nfss->bsize);
seq_printf(m, ",namlen=%u", nfss->namelen);
- if (nfss->acregmin != 3*HZ || showdefaults)
+ if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
- if (nfss->acregmax != 60*HZ || showdefaults)
+ if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
- if (nfss->acdirmin != 30*HZ || showdefaults)
+ if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
- if (nfss->acdirmax != 60*HZ || showdefaults)
+ if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
return 0;
}
+static void nfs_parse_ipv4_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+
+ if (str_len <= INET_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
+ (int)str_len, string);
+
+ sin->sin_family = AF_INET;
+ *addr_len = sizeof(*sin);
+ if (in4_pton(string, str_len, addr, '\0', NULL))
+ return;
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+
+#define IPV6_SCOPE_DELIMITER '%'
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+ const char *delim,
+ struct sockaddr_in6 *sin6)
+{
+ char *p;
+ size_t len;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return ;
+ if (*delim != IPV6_SCOPE_DELIMITER)
+ return;
+
+ len = (string + str_len) - delim - 1;
+ p = kstrndup(delim + 1, len, GFP_KERNEL);
+ if (p) {
+ unsigned long scope_id = 0;
+ struct net_device *dev;
+
+ dev = dev_get_by_name(&init_net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ /* scope_id is set to zero on error */
+ strict_strtoul(p, 10, &scope_id);
+ }
+
+ kfree(p);
+ sin6->sin6_scope_id = scope_id;
+ dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+ }
+}
+
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+ const char *delim;
+
+ if (str_len <= INET6_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
+ (int)str_len, string);
+
+ sin6->sin6_family = AF_INET6;
+ *addr_len = sizeof(*sin6);
+ if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
+ nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
+ return;
+ }
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#else
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#endif
+
/*
- * Parse string addresses passed in via a mount option,
- * and construct a sockaddr based on the result.
+ * Construct a sockaddr based on the contents of a string that contains
+ * an IP address in presentation format.
*
- * If address parsing fails, set the sockaddr's address
- * family to AF_UNSPEC to force nfs_verify_server_address()
- * to punt the mount.
+ * If there is a problem constructing the new sockaddr, set the address
+ * family to AF_UNSPEC.
*/
-static void nfs_parse_server_address(char *value,
- struct sockaddr *sap,
- size_t *len)
+static void nfs_parse_ip_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
{
- if (strchr(value, ':')) {
- struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
- u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
+ unsigned int i, colons;
- ap->sin6_family = AF_INET6;
- *len = sizeof(*ap);
- if (in6_pton(value, -1, addr, '\0', NULL))
- return;
- } else {
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+ colons = 0;
+ for (i = 0; i < str_len; i++)
+ if (string[i] == ':')
+ colons++;
+
+ if (colons >= 2)
+ nfs_parse_ipv6_address(string, str_len, sap, addr_len);
+ else
+ nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+}
+
+/*
+ * Sanity check the NFS transport protocol.
+ *
+ */
+static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ break;
+ default:
+ mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ }
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ nfs_validate_transport_protocol(mnt);
- ap->sin_family = AF_INET;
- *len = sizeof(*ap);
- if (in4_pton(value, -1, addr, '\0', NULL))
+ if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
+ mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
return;
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
}
+}
- sap->sa_family = AF_UNSPEC;
- *len = 0;
+/*
+ * Parse the value of the 'sec=' option.
+ *
+ * The flavor_len setting is for v4 mounts.
+ */
+static int nfs_parse_security_flavors(char *value,
+ struct nfs_parsed_mount_data *mnt)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
+
+ switch (match_token(value, nfs_secflavor_tokens, args)) {
+ case Opt_sec_none:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_NULL;
+ break;
+ case Opt_sec_sys:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_UNIX;
+ break;
+ case Opt_sec_krb5:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
+ break;
+ case Opt_sec_krb5p:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
+ break;
+ case Opt_sec_lkey:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
+ break;
+ case Opt_sec_lkeyi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
+ break;
+ case Opt_sec_lkeyp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
+ break;
+ case Opt_sec_spkm:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
+ break;
+ case Opt_sec_spkmi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
+ break;
+ case Opt_sec_spkmp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static void nfs_parse_invalid_value(const char *option)
+{
+ dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
}
/*
* Error-check and convert a string of mount options from user space into
- * a data structure
+ * a data structure. The whole mount string is processed; bad options are
+ * skipped as they are encountered. If there were no errors, return 1;
+ * otherwise return 0 (zero).
*/
static int nfs_parse_mount_options(char *raw,
struct nfs_parsed_mount_data *mnt)
{
char *p, *string, *secdata;
- int rc;
+ int rc, sloppy = 0, errors = 0;
if (!raw) {
dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
token = match_token(p, nfs_mount_option_tokens, args);
switch (token) {
+
+ /*
+ * boolean options: foo/nofoo
+ */
case Opt_soft:
mnt->flags |= NFS_MOUNT_SOFT;
break;
case Opt_hard:
mnt->flags &= ~NFS_MOUNT_SOFT;
break;
- case Opt_intr:
- case Opt_nointr:
- break;
case Opt_posix:
mnt->flags |= NFS_MOUNT_POSIX;
break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_rdma:
mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_acl:
mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_UNSHARED;
break;
+ /*
+ * options that take numeric values
+ */
case Opt_port:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->nfs_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("port");
+ } else
+ mnt->nfs_server.port = option;
break;
case Opt_rsize:
- if (match_int(args, &mnt->rsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("rsize");
+ } else
+ mnt->rsize = option;
break;
case Opt_wsize:
- if (match_int(args, &mnt->wsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("wsize");
+ } else
+ mnt->wsize = option;
break;
case Opt_bsize:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->bsize = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("bsize");
+ } else
+ mnt->bsize = option;
break;
case Opt_timeo:
- if (match_int(args, &mnt->timeo))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("timeo");
+ } else
+ mnt->timeo = option;
break;
case Opt_retrans:
- if (match_int(args, &mnt->retrans))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("retrans");
+ } else
+ mnt->retrans = option;
break;
case Opt_acregmin:
- if (match_int(args, &mnt->acregmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmin");
+ } else
+ mnt->acregmin = option;
break;
case Opt_acregmax:
- if (match_int(args, &mnt->acregmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmax");
+ } else
+ mnt->acregmax = option;
break;
case Opt_acdirmin:
- if (match_int(args, &mnt->acdirmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmin");
+ } else
+ mnt->acdirmin = option;
break;
case Opt_acdirmax:
- if (match_int(args, &mnt->acdirmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmax");
+ } else
+ mnt->acdirmax = option;
break;
case Opt_actimeo:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->acregmin =
- mnt->acregmax =
- mnt->acdirmin =
- mnt->acdirmax = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("actimeo");
+ } else
+ mnt->acregmin = mnt->acregmax =
+ mnt->acdirmin = mnt->acdirmax = option;
break;
case Opt_namelen:
- if (match_int(args, &mnt->namlen))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("namlen");
+ } else
+ mnt->namlen = option;
break;
case Opt_mountport:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->mount_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("mountport");
+ } else
+ mnt->mount_server.port = option;
break;
case Opt_mountvers:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->mount_server.version = option;
+ if (match_int(args, &option) ||
+ option < NFS_MNT_VERSION ||
+ option > NFS_MNT3_VERSION) {
+ errors++;
+ nfs_parse_invalid_value("mountvers");
+ } else
+ mnt->mount_server.version = option;
break;
case Opt_nfsvers:
- if (match_int(args, &option))
- return 0;
+ if (match_int(args, &option)) {
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
+ break;
+ }
switch (option) {
- case 2:
+ case NFS2_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
break;
- case 3:
+ case NFS3_VERSION:
mnt->flags |= NFS_MOUNT_VER3;
break;
default:
- goto out_unrec_vers;
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
}
break;
+ /*
+ * options that take text values
+ */
case Opt_sec:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- token = match_token(string, nfs_secflavor_tokens, args);
+ rc = nfs_parse_security_flavors(string, mnt);
kfree(string);
-
- /*
- * The flags setting is for v2/v3. The flavor_len
- * setting is for v4. v2/v3 also need to know the
- * difference between NULL and UNIX.
- */
- switch (token) {
- case Opt_sec_none:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_NULL;
- break;
- case Opt_sec_sys:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case Opt_sec_krb5:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
- break;
- case Opt_sec_krb5i:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
- break;
- case Opt_sec_krb5p:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
- break;
- case Opt_sec_lkey:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
- break;
- case Opt_sec_lkeyi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
- break;
- case Opt_sec_lkeyp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
- break;
- case Opt_sec_spkm:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
- break;
- case Opt_sec_spkmi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
- break;
- case Opt_sec_spkmp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
- break;
- default:
- goto out_unrec_sec;
+ if (!rc) {
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "security flavor\n");
}
break;
case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
case Opt_xprt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_xprt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
break;
case Opt_xprt_rdma: /* not used for side protocols */
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_addr:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->nfs_server.address,
- &mnt->nfs_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->nfs_server.address,
+ &mnt->nfs_server.addrlen);
kfree(string);
break;
case Opt_clientaddr:
@@ -1093,24 +1252,33 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->mount_server.address,
- &mnt->mount_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->mount_server.address,
+ &mnt->mount_server.addrlen);
kfree(string);
break;
+ /*
+ * Special options
+ */
+ case Opt_sloppy:
+ sloppy = 1;
+ dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
+ break;
case Opt_userspace:
case Opt_deprecated:
+ dfprintk(MOUNT, "NFS: ignoring mount option "
+ "'%s'\n", p);
break;
default:
- goto out_unknown;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized mount option "
+ "'%s'\n", p);
}
}
- nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
- mnt->nfs_server.port);
-
return 1;
out_nomem:
@@ -1120,21 +1288,6 @@ out_security_failure:
free_secdata(secdata);
printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
return 0;
-out_unrec_vers:
- printk(KERN_INFO "NFS: unrecognized NFS version number\n");
- return 0;
-
-out_unrec_xprt:
- printk(KERN_INFO "NFS: unrecognized transport protocol\n");
- return 0;
-
-out_unrec_sec:
- printk(KERN_INFO "NFS: unrecognized security flavor\n");
- return 0;
-
-out_unknown:
- printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
- return 0;
}
/*
@@ -1188,11 +1341,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
if (status == 0)
return 0;
- dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
hostname, status);
return status;
}
+/*
+ * Split a "hostname:export_path" device name at its first colon.
+ *
+ * On success, returns 0 with *hostname and *export_path set to newly
+ * allocated copies of the two halves. If the hostname half contains a
+ * comma, everything from the comma onward is dropped.
+ *
+ * Returns -EINVAL if dev_name has no colon or the hostname starts with
+ * a comma, -ENAMETOOLONG if either half exceeds its limit, and -ENOMEM
+ * if a copy cannot be allocated. *hostname may already have been
+ * allocated when an error is returned.
+ */
+static int nfs_parse_simple_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *colon, *comma;
+
+ colon = strchr(dev_name, ':');
+ if (colon == NULL)
+ goto out_bad_devname;
+
+ len = colon - dev_name;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(dev_name, len, GFP_KERNEL);
+ if (!*hostname)
+ goto out_nomem;
+
+ /* kill possible hostname list: not supported */
+ comma = strchr(*hostname, ',');
+ if (comma != NULL) {
+ if (comma == *hostname)
+ goto out_bad_devname;
+ *comma = '\0';
+ }
+
+ colon++;
+ len = strlen(colon);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(colon, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Hostname has square brackets around it because it contains one or
+ * more colons. We look for the first closing square bracket, and a
+ * colon must follow it.
+ *
+ * On success, returns 0 with *hostname set to a newly allocated copy of
+ * the text between the brackets and *export_path set to a newly
+ * allocated copy of everything after "]:".
+ *
+ * Returns -EINVAL if there is no "]" or it is not followed by ":",
+ * -ENAMETOOLONG if either part exceeds its limit, and -ENOMEM if a copy
+ * cannot be allocated. *hostname may already have been allocated when
+ * an error is returned.
+ */
+static int nfs_parse_protected_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *start, *end;
+
+ /* skip the first character of dev_name (the caller saw '[' there) */
+ start = (char *)(dev_name + 1);
+
+ end = strchr(start, ']');
+ if (end == NULL)
+ goto out_bad_devname;
+ if (*(end + 1) != ':')
+ goto out_bad_devname;
+
+ len = end - start;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(start, len, GFP_KERNEL);
+ if (*hostname == NULL)
+ goto out_nomem;
+
+ /* step over "]:" so that end points at the export path */
+ end += 2;
+ len = strlen(end);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(end, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path. If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ *
+ * Returns 0 on success, or the negative errno (-EINVAL, -ENOMEM or
+ * -ENAMETOOLONG) produced by whichever helper handled the name.
+ */
+static int nfs_parse_devname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ /* a leading '[' selects the bracketed-hostname parser */
+ if (*dev_name == '[')
+ return nfs_parse_protected_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+
+ return nfs_parse_simple_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+}
+
/*
* Validate the NFS2/NFS3 mount data
* - fills in the mount root filehandle
@@ -1222,16 +1510,14 @@ static int nfs_validate_mount_data(void *options,
args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->mount_server.port = 0; /* autobind unless user sets port */
- args->mount_server.protocol = XPRT_TRANSPORT_UDP;
args->nfs_server.port = 0; /* autobind unless user sets port */
args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
switch (data->version) {
case 1:
@@ -1289,7 +1575,9 @@ static int nfs_validate_mount_data(void *options,
args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
args->namlen = data->namlen;
args->bsize = data->bsize;
- args->auth_flavors[0] = data->pseudoflavor;
+
+ if (data->flags & NFS_MOUNT_SECFLAVOUR)
+ args->auth_flavors[0] = data->pseudoflavor;
if (!args->nfs_server.hostname)
goto out_nomem;
@@ -1321,8 +1609,6 @@ static int nfs_validate_mount_data(void *options,
break;
default: {
- unsigned int len;
- char *c;
int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1618,22 @@ static int nfs_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- len = c - dev_name;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- c++;
- if (strlen(c) > NFS_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = c;
+ nfs_set_mount_transport_protocol(args);
+
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ PAGE_SIZE,
+ &args->nfs_server.export_path,
+ NFS_MAXPATHLEN);
+ if (!status)
+ status = nfs_try_mount(args, mntfh);
+
+ kfree(args->nfs_server.export_path);
+ args->nfs_server.export_path = NULL;
- status = nfs_try_mount(args, mntfh);
if (status)
return status;
@@ -1354,9 +1641,6 @@ static int nfs_validate_mount_data(void *options,
}
}
- if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
- args->auth_flavors[0] = RPC_AUTH_UNIX;
-
#ifndef CONFIG_NFS_V3
if (args->flags & NFS_MOUNT_VER3)
goto out_v3_not_compiled;
@@ -1396,6 +1680,80 @@ out_invalid_fh:
return -EINVAL;
}
+/*
+ * Check that the parsed mount options in "data" agree with the settings
+ * currently recorded for the server "nfss".
+ *
+ * The fields compared are flags, rsize, wsize, retrans, timeo, the first
+ * auth flavor, the four attribute cache limits, and the server address
+ * and its length. The attribute cache limits held in nfss are divided
+ * by HZ before comparison, and timeo is compared against
+ * 10 * to_initval / HZ.
+ *
+ * Returns 0 if every field matches, -EINVAL otherwise.
+ */
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+ struct nfs_parsed_mount_data *data)
+{
+ if (data->flags != nfss->flags ||
+ data->rsize != nfss->rsize ||
+ data->wsize != nfss->wsize ||
+ data->retrans != nfss->client->cl_timeout->to_retries ||
+ data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+ data->acregmin != nfss->acregmin / HZ ||
+ data->acregmax != nfss->acregmax / HZ ||
+ data->acdirmin != nfss->acdirmin / HZ ||
+ data->acdirmax != nfss->acdirmax / HZ ||
+ data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
+ memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Handle a remount request for an existing NFS superblock.
+ *
+ * Nothing on the live mount is changed here. The text options in
+ * raw_data are parsed on top of a copy of the current settings and the
+ * result is compared with what the mount is already using.
+ *
+ * Returns 0 if raw_data is absent, is binary mount data, or describes
+ * the settings already in effect; -ENOMEM if the scratch structure
+ * cannot be allocated; -EINVAL if the options fail to parse or differ
+ * from the current settings.
+ */
+static int
+nfs_remount(struct super_block *sb, int *flags, char *raw_data)
+{
+ int error;
+ struct nfs_server *nfss = sb->s_fs_info;
+ struct nfs_parsed_mount_data *data;
+ struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
+ struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
+ u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+
+ /*
+ * Userspace mount programs that send binary options generally send
+ * them populated with default values. We have no way to know which
+ * ones were explicitly specified. Fall back to legacy behavior and
+ * just return success.
+ *
+ * A remount that carries no option data at all gets the same
+ * treatment; checking for NULL first also keeps us from
+ * dereferencing a NULL raw_data.
+ */
+ if ((nfsvers == 4 && (!options4 || options4->version == 1)) ||
+ (nfsvers <= 3 && (!options || (options->version >= 1 &&
+ options->version <= 6))))
+ return 0;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ /* fill out struct with values from existing mount */
+ data->flags = nfss->flags;
+ data->rsize = nfss->rsize;
+ data->wsize = nfss->wsize;
+ data->retrans = nfss->client->cl_timeout->to_retries;
+ data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
+ data->acregmin = nfss->acregmin / HZ;
+ data->acregmax = nfss->acregmax / HZ;
+ data->acdirmin = nfss->acdirmin / HZ;
+ data->acdirmax = nfss->acdirmax / HZ;
+ data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+ memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen);
+
+ /*
+ * overwrite those values with any that were specified
+ *
+ * nfs_parse_mount_options() returns 1 on success and 0 on failure,
+ * never a negative errno, so a failed parse has to be turned into
+ * -EINVAL here rather than tested with "< 0".
+ */
+ error = -EINVAL;
+ if (!nfs_parse_mount_options((char *)options, data))
+ goto out;
+
+ /* compare new mount options with old ones */
+ error = nfs_compare_remount_data(nfss, data);
+out:
+ kfree(data);
+ return error;
+}
+
/*
* Initialise the common bits of the superblock
*/
@@ -1811,14 +2169,13 @@ static int nfs4_validate_mount_data(void *options,
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ args->auth_flavor_len = 0;
switch (data->version) {
case 1:
@@ -1834,18 +2191,13 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- switch (data->auth_flavourlen) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
+ if (data->auth_flavourlen) {
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
if (copy_from_user(&args->auth_flavors[0],
data->auth_flavours,
sizeof(args->auth_flavors[0])))
return -EFAULT;
- break;
- default:
- goto out_inval_auth;
}
c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2231,11 @@ static int nfs4_validate_mount_data(void *options,
args->acdirmin = data->acdirmin;
args->acdirmax = data->acdirmax;
args->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(args);
break;
default: {
- unsigned int len;
+ int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
@@ -1891,44 +2244,25 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
return -EINVAL;
- switch (args->auth_flavor_len) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
- break;
- default:
- goto out_inval_auth;
- }
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- /*
- * Split "dev_name" into "hostname:mntpath".
- */
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- /* while calculating len, pretend ':' is '\0' */
- len = c - dev_name;
- if (len > NFS4_MAXNAMLEN)
- return -ENAMETOOLONG;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
-
- c++; /* step over the ':' */
- len = strlen(c);
- if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
- if (!args->nfs_server.export_path)
- goto out_nomem;
+ nfs_validate_transport_protocol(args);
- dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
+ if (args->auth_flavor_len > 1)
+ goto out_inval_auth;
if (args->client_address == NULL)
goto out_no_client_address;
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ NFS4_MAXNAMLEN,
+ &args->nfs_server.export_path,
+ NFS4_MAXPATHLEN);
+ if (status < 0)
+ return status;
+
break;
}
}
@@ -1944,10 +2278,6 @@ out_inval_auth:
data->auth_flavourlen);
return -EINVAL;
-out_nomem:
- dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
- return -ENOMEM;
-
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3b..3229e217c77 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
/*
* Local function declarations
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context*,
- struct page *,
- unsigned int, unsigned int);
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
struct inode *inode, int ioflags);
static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
struct inode *inode = page->mapping->host;
- loff_t end, i_size = i_size_read(inode);
- pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ loff_t end, i_size;
+ pgoff_t end_index;
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
+ end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
if (i_size > 0 && page->index < end_index)
- return;
+ goto out;
end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
if (i_size >= end)
- return;
- nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+ goto out;
i_size_write(inode, end);
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+ spin_unlock(&inode->i_lock);
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
SetPageUptodate(page);
}
-static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
-{
- struct nfs_page *req;
- int ret;
-
- for (;;) {
- req = nfs_update_request(ctx, page, offset, count);
- if (!IS_ERR(req))
- break;
- ret = PTR_ERR(req);
- if (ret != -EBUSY)
- return ret;
- ret = nfs_wb_page(page->mapping->host, page);
- if (ret != 0)
- return ret;
- }
- /* Update file length */
- nfs_grow_file(page, offset, count);
- nfs_clear_page_tag_locked(req);
- return 0;
-}
-
static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
return ret;
spin_lock(&inode->i_lock);
}
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
- /* This request is marked for commit */
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
spin_unlock(&inode->i_lock);
- nfs_clear_page_tag_locked(req);
- nfs_pageio_complete(pgio);
- return 0;
+ BUG();
}
if (nfs_set_page_writeback(page) != 0) {
spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
/*
* Insert a write request into an inode
*/
-static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
int error;
+ error = radix_tree_preload(GFP_NOFS);
+ if (error != 0)
+ goto out;
+
+ /* Lock the request! */
+ nfs_lock_request_dontget(req);
+
+ spin_lock(&inode->i_lock);
error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
BUG_ON(error);
if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
kref_get(&req->wb_kref);
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_LOCKED);
+ spin_unlock(&inode->i_lock);
+ radix_tree_preload_end();
+out:
+ return error;
}
/*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-/*
- * Check if a request is dirty
- */
-static inline int
-nfs_dirty_request(struct nfs_page *req)
-{
- struct page *page = req->wb_page;
-
- if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
- return 0;
- return !PageWriteback(page);
-}
-
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
/*
* Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
spin_lock(&inode->i_lock);
nfsi->ncommit++;
- set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ set_bit(PG_CLEAN, &(req)->wb_flags);
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
+/*
+ * Clear the PG_CLEAN flag on a request, if it is set.
+ *
+ * When the flag was set, the NR_UNSTABLE_NFS zone counter for the
+ * request's page and the BDI_RECLAIMABLE counter of the page's backing
+ * device are both decremented.
+ *
+ * Returns 1 if the flag was set (and has now been cleared), 0 if it was
+ * not set.
+ */
+static int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ struct page *page = req->wb_page;
+
+ if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ return 1;
+ }
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
static inline
int nfs_reschedule_unstable_write(struct nfs_page *req)
{
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
nfs_mark_request_commit(req);
return 1;
}
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
{
}
+/* Stub variant: does nothing and always returns 0 (no flag was cleared). */
+static inline int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ nfs_clear_request_commit(req);
nfs_inode_remove_request(req);
nfs_unlock_request(req);
}
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
#endif
/*
- * Try to update any existing write request, or create one if there is none.
- * In order to match, the request's credentials must match those of
- * the calling process.
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
*
- * Note: Should always be called with the Page Lock held!
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
- struct page *page, unsigned int offset, unsigned int bytes)
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ struct page *page,
+ unsigned int offset,
+ unsigned int bytes)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct nfs_page *req, *new = NULL;
- pgoff_t rqend, end;
+ struct nfs_page *req;
+ unsigned int rqend;
+ unsigned int end;
+ int error;
+
+ if (!PagePrivate(page))
+ return NULL;
end = offset + bytes;
+ spin_lock(&inode->i_lock);
for (;;) {
- /* Loop over all inode entries and see if we find
- * A request for the page we wish to update
+ req = nfs_page_find_request_locked(page);
+ if (req == NULL)
+ goto out_unlock;
+
+ rqend = req->wb_offset + req->wb_bytes;
+ /*
+ * Tell the caller to flush out the request if
+ * the offsets are non-contiguous.
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
*/
- if (new) {
- if (radix_tree_preload(GFP_NOFS)) {
- nfs_release_request(new);
- return ERR_PTR(-ENOMEM);
- }
- }
+ if (offset > rqend
+ || end < req->wb_offset)
+ goto out_flushme;
- spin_lock(&inode->i_lock);
- req = nfs_page_find_request_locked(page);
- if (req) {
- if (!nfs_set_page_tag_locked(req)) {
- int error;
-
- spin_unlock(&inode->i_lock);
- error = nfs_wait_on_request(req);
- nfs_release_request(req);
- if (error < 0) {
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
- return ERR_PTR(error);
- }
- continue;
- }
- spin_unlock(&inode->i_lock);
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
+ if (nfs_set_page_tag_locked(req))
break;
- }
- if (new) {
- nfs_lock_request_dontget(new);
- nfs_inode_add_request(inode, new);
- spin_unlock(&inode->i_lock);
- radix_tree_preload_end();
- req = new;
- goto zero_page;
- }
+ /* The request is locked, so wait and then retry */
spin_unlock(&inode->i_lock);
-
- new = nfs_create_request(ctx, inode, page, offset, bytes);
- if (IS_ERR(new))
- return new;
+ error = nfs_wait_on_request(req);
+ nfs_release_request(req);
+ if (error != 0)
+ goto out_err;
+ spin_lock(&inode->i_lock);
}
- /* We have a request for our page.
- * If the creds don't match, or the
- * page addresses don't match,
- * tell the caller to wait on the conflicting
- * request.
- */
- rqend = req->wb_offset + req->wb_bytes;
- if (req->wb_context != ctx
- || req->wb_page != page
- || !nfs_dirty_request(req)
- || offset > rqend || end < req->wb_offset) {
- nfs_clear_page_tag_locked(req);
- return ERR_PTR(-EBUSY);
- }
+ if (nfs_clear_request_commit(req))
+ radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+ req->wb_index, NFS_PAGE_TAG_COMMIT);
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
req->wb_offset = offset;
req->wb_pgbase = offset;
- req->wb_bytes = max(end, rqend) - req->wb_offset;
- goto zero_page;
}
-
if (end > rqend)
req->wb_bytes = end - req->wb_offset;
-
+ else
+ req->wb_bytes = rqend - req->wb_offset;
+out_unlock:
+ spin_unlock(&inode->i_lock);
return req;
-zero_page:
- /* If this page might potentially be marked as up to date,
- * then we need to zero any uninitalised data. */
- if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE
- && !PageUptodate(req->wb_page))
- zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE);
+out_flushme:
+ spin_unlock(&inode->i_lock);
+ nfs_release_request(req);
+ error = nfs_wb_page(inode, page);
+out_err:
+ return ERR_PTR(error);
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+ struct page *page, unsigned int offset, unsigned int bytes)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+ int error;
+
+ req = nfs_try_to_update_request(inode, page, offset, bytes);
+ if (req != NULL)
+ goto out;
+ req = nfs_create_request(ctx, inode, page, offset, bytes);
+ if (IS_ERR(req))
+ goto out;
+ error = nfs_inode_add_request(inode, req);
+ if (error != 0) {
+ nfs_release_request(req);
+ req = ERR_PTR(error);
+ }
+out:
return req;
}
+/*
+ * Set up a write request for "count" bytes at "offset" within "page".
+ *
+ * Obtains a request from nfs_setup_write_request(), then calls
+ * nfs_grow_file() for the written range, nfs_mark_uptodate() for the
+ * region the request covers, and nfs_clear_page_tag_locked() on the
+ * request.
+ *
+ * Returns 0 on success, or the error encoded in the pointer returned by
+ * nfs_setup_write_request().
+ */
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+ unsigned int offset, unsigned int count)
+{
+ struct nfs_page *req;
+
+ req = nfs_setup_write_request(ctx, page, offset, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ /* Update file length */
+ nfs_grow_file(page, offset, count);
+ nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_clear_page_tag_locked(req);
+ return 0;
+}
+
int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
return 0;
- do_flush = req->wb_page != page || req->wb_context != ctx
- || !nfs_dirty_request(req);
+ do_flush = req->wb_page != page || req->wb_context != ctx;
nfs_release_request(req);
if (!do_flush)
return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
- dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n",
+ dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name, count,
- (long long)(page_offset(page) +offset));
+ (long long)(page_offset(page) + offset));
/* If we're not using byte range locks, and we know the page
* is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
else
__set_page_dirty_nobuffers(page);
- dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
+ dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
}
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
static void nfs_writepage_release(struct nfs_page *req)
{
- if (PageError(req->wb_page)) {
- nfs_end_page_writeback(req->wb_page);
- nfs_inode_remove_request(req);
- } else if (!nfs_reschedule_unstable_write(req)) {
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
+ if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
nfs_end_page_writeback(req->wb_page);
nfs_inode_remove_request(req);
} else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
NFS_PROTO(inode)->write_setup(data, &msg);
dprintk("NFS: %5u initiated write call "
- "(req %s/%Ld, %u bytes @ offset %Lu)\n",
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
data->task.tk_pid,
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- struct nfs_page *req = data->req;
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
- req->wb_context->path.dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
- req->wb_bytes,
- (long long)req_offset(req));
+ dprintk("NFS: %5u write(%s/%lld %d@%lld)",
+ task->tk_pid,
+ data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
+ (long long)
+ NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+ data->req->wb_bytes, (long long)req_offset(data->req));
nfs_writeback_done(task, data);
}
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
nfs_list_remove_request(req);
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
+ dprintk("NFS: %5u write (%s/%lld %d@%lld)",
+ data->task.tk_pid,
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
dprintk(" marked for commit\n");
goto next;
}
- /* Set the PG_uptodate flag? */
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
dprintk(" OK\n");
remove_request:
nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
static unsigned long complain;
if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
+ dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(data->inode)->nfs_client->cl_hostname,
resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
+ nfs_clear_request_commit(req);
- dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+ dprintk("NFS: commit (%s/%lld %d@%lld)",
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
* returned by the server against all stored verfs. */
if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
/* We have a match */
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
- req->wb_bytes);
nfs_inode_remove_request(req);
dprintk(" OK\n");
goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
goto out;
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
nfs_release_request(req);
break;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c..702fa577aa6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -381,7 +381,7 @@ static int do_probe_callback(void *data)
.program = &cb_program,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
- .flags = (RPC_CLNT_CREATE_NOPING),
+ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0505a..1db080135c6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -174,10 +174,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
* allows block_prepare_write() to zero.
+ *
+ * If we see this on a sparse file system, then a truncate has
+ * raced us and removed the cluster. In this case, we clear
+ * the buffers dirty and uptodate bits and let the buffer code
+ * ignore it as a hole.
*/
- mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
- "ino %lu, iblock %llu\n", inode->i_ino,
- (unsigned long long)iblock);
+ if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
+ clear_buffer_dirty(bh_result);
+ clear_buffer_uptodate(bh_result);
+ goto bail;
+ }
/* Treat the unwritten extent as a hole for zeroing purposes. */
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f02ccb34604..443d108211a 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1489,25 +1489,28 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
-static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
- const char *name)
+static int o2hb_heartbeat_group_make_item(struct config_group *group,
+ const char *name,
+ struct config_item **new_item)
{
struct o2hb_region *reg = NULL;
- struct config_item *ret = NULL;
+ int ret = 0;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
- if (reg == NULL)
- goto out; /* ENOMEM */
+ if (reg == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
- ret = &reg->hr_item;
+ *new_item = &reg->hr_item;
spin_lock(&o2hb_live_lock);
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
out:
- if (ret == NULL)
+ if (ret)
kfree(reg);
return ret;
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7bf3c0ea7bd..d8bfa0eb41b 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -146,8 +146,10 @@ static int nst_seq_show(struct seq_file *seq, void *v)
nst->st_task->comm, nst->st_node,
nst->st_sc, nst->st_id, nst->st_msg_type,
nst->st_msg_key,
- nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
- nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+ nst->st_sock_time.tv_sec,
+ (unsigned long)nst->st_sock_time.tv_usec,
+ nst->st_send_time.tv_sec,
+ (unsigned long)nst->st_send_time.tv_usec,
nst->st_status_time.tv_sec,
nst->st_status_time.tv_usec);
}
@@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return sc; /* unused, just needs to be null when done */
}
-#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
static int sc_seq_show(struct seq_file *seq, void *v)
{
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cfdb08b484e..b364b7052e4 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -644,27 +644,32 @@ out:
return ret;
}
-static struct config_item *o2nm_node_group_make_item(struct config_group *group,
- const char *name)
+static int o2nm_node_group_make_item(struct config_group *group,
+ const char *name,
+ struct config_item **new_item)
{
struct o2nm_node *node = NULL;
- struct config_item *ret = NULL;
+ int ret = 0;
- if (strlen(name) > O2NM_MAX_NAME_LEN)
- goto out; /* ENAMETOOLONG */
+ if (strlen(name) > O2NM_MAX_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
- if (node == NULL)
- goto out; /* ENOMEM */
+ if (node == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
- ret = &node->nd_item;
+ *new_item = &node->nd_item;
out:
- if (ret == NULL)
+ if (ret)
kfree(node);
return ret;
@@ -751,25 +756,31 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro
}
#endif
-static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
- const char *name)
+static int o2nm_cluster_group_make_group(struct config_group *group,
+ const char *name,
+ struct config_group **new_group)
{
struct o2nm_cluster *cluster = NULL;
struct o2nm_node_group *ns = NULL;
- struct config_group *o2hb_group = NULL, *ret = NULL;
+ struct config_group *o2hb_group = NULL;
void *defs = NULL;
+ int ret = 0;
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
- if (o2nm_single_cluster)
- goto out; /* ENOSPC */
+ if (o2nm_single_cluster) {
+ ret = -ENOSPC;
+ goto out;
+ }
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
- if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+ if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) {
+ ret = -ENOMEM;
goto out;
+ }
config_group_init_type_name(&cluster->cl_group, name,
&o2nm_cluster_type);
@@ -786,11 +797,11 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
- ret = &cluster->cl_group;
+ *new_group = &cluster->cl_group;
o2nm_single_cluster = cluster;
out:
- if (ret == NULL) {
+ if (ret) {
kfree(cluster);
kfree(ns);
o2hb_free_hb_set(o2hb_group);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128..44f87caf368 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ spin_lock(&dlm->spinlock);
list_add_tail(&res->tracking, &dlm->tracking_list);
+ spin_unlock(&dlm->spinlock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a..eae3d643a5e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -31,6 +31,7 @@
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/time.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -59,6 +60,9 @@ struct ocfs2_mask_waiter {
struct completion mw_complete;
unsigned long mw_mask;
unsigned long mw_goal;
+#ifdef CONFIG_OCFS2_FS_STATS
+ unsigned long long mw_lock_start;
+#endif
};
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
@@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
spin_unlock(&ocfs2_dlm_tracking_lock);
}
+#ifdef CONFIG_OCFS2_FS_STATS
+static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+ res->l_lock_num_prmode = 0;
+ res->l_lock_num_prmode_failed = 0;
+ res->l_lock_total_prmode = 0;
+ res->l_lock_max_prmode = 0;
+ res->l_lock_num_exmode = 0;
+ res->l_lock_num_exmode_failed = 0;
+ res->l_lock_total_exmode = 0;
+ res->l_lock_max_exmode = 0;
+ res->l_lock_refresh = 0;
+}
+
+static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
+ struct ocfs2_mask_waiter *mw, int ret)
+{
+ unsigned long long *num, *sum;
+ unsigned int *max, *failed;
+ struct timespec ts = current_kernel_time();
+ unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
+
+ if (level == LKM_PRMODE) {
+ num = &res->l_lock_num_prmode;
+ sum = &res->l_lock_total_prmode;
+ max = &res->l_lock_max_prmode;
+ failed = &res->l_lock_num_prmode_failed;
+ } else if (level == LKM_EXMODE) {
+ num = &res->l_lock_num_exmode;
+ sum = &res->l_lock_total_exmode;
+ max = &res->l_lock_max_exmode;
+ failed = &res->l_lock_num_exmode_failed;
+ } else
+ return;
+
+ (*num)++;
+ (*sum) += time;
+ if (time > *max)
+ *max = time;
+ if (ret)
+ (*failed)++;
+}
+
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+ lockres->l_lock_refresh++;
+}
+
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+ struct timespec ts = current_kernel_time();
+ mw->mw_lock_start = timespec_to_ns(&ts);
+}
+#else
+static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+}
+static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
+ int level, struct ocfs2_mask_waiter *mw, int ret)
+{
+}
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+}
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+}
+#endif
+
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
@@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
res->l_flags = OCFS2_LOCK_INITIALIZED;
ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+
+ ocfs2_init_lock_stats(res);
}
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
INIT_LIST_HEAD(&mw->mw_item);
init_completion(&mw->mw_complete);
+ ocfs2_init_start_time(mw);
}
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
@@ -1254,6 +1330,7 @@ out:
goto again;
mlog_errno(ret);
}
+ ocfs2_update_lock_stats(lockres, level, &mw, ret);
mlog_exit(ret);
return ret;
@@ -1554,8 +1631,8 @@ out:
*/
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
- int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
- unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+ int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1659,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* Get the lock at NLMODE to start - that way we
* can cancel the upconvert request if need be.
*/
- ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1597,7 +1674,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
}
lockres->l_action = OCFS2_AST_CONVERT;
- lkm_flags |= LKM_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1664,7 +1741,7 @@ void ocfs2_file_unlock(struct file *file)
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
return;
- if (lockres->l_level == LKM_NLMODE)
+ if (lockres->l_level == DLM_LOCK_NL)
return;
mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1755,11 @@ void ocfs2_file_unlock(struct file *file)
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
lockres->l_blocking = DLM_LOCK_EX;
- gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
+ ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
if (ret) {
mlog_errno(ret);
return;
@@ -1983,6 +2060,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
le32_to_cpu(fe->i_flags));
ocfs2_refresh_inode(inode, fe);
+ ocfs2_track_lock_refresh(lockres);
}
status = 0;
@@ -2267,6 +2345,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);
+ ocfs2_track_lock_refresh(lockres);
}
bail:
mlog_exit(status);
@@ -2461,7 +2540,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
}
/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 1
+#define OCFS2_DLM_DEBUG_STR_VERSION 2
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -2502,6 +2581,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
for(i = 0; i < DLM_LVB_LEN; i++)
seq_printf(m, "0x%x\t", lvb[i]);
+#ifdef CONFIG_OCFS2_FS_STATS
+# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
+# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
+# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
+# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
+# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
+# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
+# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
+# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
+# define lock_refresh(_l) (_l)->l_lock_refresh
+#else
+# define lock_num_prmode(_l) (0ULL)
+# define lock_num_exmode(_l) (0ULL)
+# define lock_num_prmode_failed(_l) (0)
+# define lock_num_exmode_failed(_l) (0)
+# define lock_total_prmode(_l) (0ULL)
+# define lock_total_exmode(_l) (0ULL)
+# define lock_max_prmode(_l) (0)
+# define lock_max_exmode(_l) (0)
+# define lock_refresh(_l) (0)
+#endif
+ /* The following seq_print was added in version 2 of this output */
+ seq_printf(m, "%llu\t"
+ "%llu\t"
+ "%u\t"
+ "%u\t"
+ "%llu\t"
+ "%llu\t"
+ "%u\t"
+ "%u\t"
+ "%u\t",
+ lock_num_prmode(lockres),
+ lock_num_exmode(lockres),
+ lock_num_prmode_failed(lockres),
+ lock_num_exmode_failed(lockres),
+ lock_total_prmode(lockres),
+ lock_total_exmode(lockres),
+ lock_max_prmode(lockres),
+ lock_max_exmode(lockres),
+ lock_refresh(lockres));
+
/* End the line */
seq_printf(m, "\n");
return 0;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30cde9..e8514e8b6ce 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2202,7 +2202,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
if (ret == -EINVAL)
- mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+ mlog(0, "generic_file_aio_read returned -EINVAL\n");
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9698338adc3..a8c19cb3cfd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -329,7 +329,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
status = journal_extend(handle, nblocks);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index be774bdc8b3..28e492e4ec8 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -498,7 +498,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %llu says it has "
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 31692379c17..1cb814be8ef 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -132,6 +132,18 @@ struct ocfs2_lock_res {
wait_queue_head_t l_event;
struct list_head l_debug_list;
+
+#ifdef CONFIG_OCFS2_FS_STATS
+ unsigned long long l_lock_num_prmode; /* PR acquires */
+ unsigned long long l_lock_num_exmode; /* EX acquires */
+ unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
+ unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
+ unsigned long long l_lock_total_prmode; /* Tot wait for PR */
+ unsigned long long l_lock_total_exmode; /* Tot wait for EX */
+ unsigned int l_lock_max_prmode; /* Max wait for PR */
+ unsigned int l_lock_max_exmode; /* Max wait for EX */
+ unsigned int l_lock_refresh; /* Disk refreshes */
+#endif
};
struct ocfs2_dlm_debug {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c42666515..3f194517762 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -901,7 +901,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
* list has a copy per slot.
*/
if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
- chars = snprintf(buf, len,
+ chars = snprintf(buf, len, "%s",
ocfs2_system_inodes[type].si_name);
else
chars = snprintf(buf, len,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd46..353fc35c674 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
+#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
@@ -549,26 +550,17 @@ static ssize_t ocfs2_control_read(struct file *file,
size_t count,
loff_t *ppos)
{
- char *proto_string = OCFS2_CONTROL_PROTO;
- size_t to_write = 0;
-
- if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
- return 0;
-
- to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
- if (to_write > count)
- to_write = count;
- if (copy_to_user(buf, proto_string + *ppos, to_write))
- return -EFAULT;
+ ssize_t ret;
- *ppos += to_write;
+ ret = simple_read_from_buffer(buf, count, ppos,
+ OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
/* Have we read the whole protocol list? */
- if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+ if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
ocfs2_control_set_handshake_state(file,
OCFS2_CONTROL_HANDSHAKE_READ);
- return to_write;
+ return ret;
}
static int ocfs2_control_release(struct inode *inode, struct file *file)
@@ -619,10 +611,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
+ lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
+ unlock_kernel();
return 0;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba20ae9..ccecfe5094f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1703,7 +1703,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
local = ocfs2_mount_local(osb);
/* will play back anything left in the journal. */
- ocfs2_journal_load(osb->journal, local);
+ status = ocfs2_journal_load(osb->journal, local);
+ if (status < 0) {
+ mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
+ goto finally;
+ }
if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
diff --git a/fs/open.c b/fs/open.c
index a1450086e92..a99ad09c319 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
+#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/vfs.h>
@@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
{
struct nameidata nd;
int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
+ kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
int res;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
old_fsuid = current->fsuid;
old_fsgid = current->fsgid;
- old_cap = current->cap_effective;
current->fsuid = current->uid;
current->fsgid = current->gid;
- /*
- * Clear the capabilities if we switch to a non-root user
- *
- * FIXME: There is a race here against sys_capset. The
- * capabilities can change yet we will restore the old
- * value below. We should hold task_capabilities_lock,
- * but we cannot because user_path_walk can sleep.
- */
- if (current->uid)
- cap_clear(current->cap_effective);
- else
- current->cap_effective = current->cap_permitted;
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ /*
+ * Clear the capabilities if we switch to a non-root user
+ */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+ /*
+ * FIXME: There is a race here against sys_capset. The
+ * capabilities can change yet we will restore the old
+ * value below. We should hold task_capabilities_lock,
+ * but we cannot because user_path_walk can sleep.
+ */
+#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+ if (current->uid)
+ old_cap = cap_set_effective(__cap_empty_set);
+ else
+ old_cap = cap_set_effective(current->cap_permitted);
+ }
res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
if (res)
@@ -478,7 +483,9 @@ out_path_release:
out:
current->fsuid = old_fsuid;
current->fsgid = old_fsgid;
- current->cap_effective = old_cap;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP))
+ cap_set_effective(old_cap);
return res;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7f..58c3e6a8e15 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
*/
if (task->parent == current && (task->ptrace & PT_PTRACED) &&
task_is_stopped_or_traced(task) &&
- ptrace_may_attach(task))
+ ptrace_may_access(task, PTRACE_MODE_ATTACH))
return 0;
/*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
task_lock(task);
if (task->mm != mm)
goto out;
- if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+ if (task->mm != current->mm &&
+ __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
goto out;
task_unlock(task);
return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_attach(task);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
put_task_struct(task);
}
return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!task)
goto out_no_task;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad46..c652d469dc0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+ return 0;
+}
+
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
len += hugetlb_report_meminfo(page + len);
+ len += arch_report_meminfo(page + len);
+
return proc_calc_metrics(page, start, off, count, eof, len);
#undef K
}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
};
#endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
sum += temp;
per_irq_sum[j] += temp;
}
+ sum += arch_irq_stat_cpu(i);
}
+ sum += arch_irq_stat();
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ab8ccc9d14f..164bd9f9ede 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
dev_t dev = 0;
int len;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
if (file) {
@@ -476,10 +476,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
- static struct mm_walk clear_refs_walk;
- memset(&clear_refs_walk, 0, sizeof(clear_refs_walk));
- clear_refs_walk.pmd_entry = clear_refs_pte_range;
- clear_refs_walk.mm = mm;
+ struct mm_walk clear_refs_walk = {
+ .pmd_entry = clear_refs_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
clear_refs_walk.private = vma;
@@ -602,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static struct mm_walk pagemap_walk = {
- .pmd_entry = pagemap_pte_range,
- .pte_hole = pagemap_pte_hole
-};
-
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -641,12 +636,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
struct pagemapread pm;
int pagecount;
int ret = -ESRCH;
+ struct mm_walk pagemap_walk;
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
if (!task)
goto out;
ret = -EACCES;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_task;
ret = -EINVAL;
@@ -659,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!mm)
goto out_task;
- ret = -ENOMEM;
+
uaddr = (unsigned long)buf & PAGE_MASK;
uend = (unsigned long)(buf + count);
pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
- pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ ret = 0;
+ if (pagecount == 0)
+ goto out_mm;
+ pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+ ret = -ENOMEM;
if (!pages)
goto out_mm;
@@ -684,33 +688,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
pm.out = (u64 *)buf;
pm.end = (u64 *)(buf + count);
- if (!ptrace_may_attach(task)) {
- ret = -EIO;
- } else {
- unsigned long src = *ppos;
- unsigned long svpfn = src / PM_ENTRY_BYTES;
- unsigned long start_vaddr = svpfn << PAGE_SHIFT;
- unsigned long end_vaddr = TASK_SIZE_OF(task);
-
- /* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
- start_vaddr = end_vaddr;
-
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
- ret = walk_page_range(start_vaddr, end_vaddr,
- &pagemap_walk);
- if (ret == PM_END_OF_BUFFER)
- ret = 0;
- /* don't need mmap_sem for these, but this looks cleaner */
- *ppos += (char *)pm.out - buf;
- if (!ret)
- ret = (char *)pm.out - buf;
- }
+ pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pte_hole = pagemap_pte_hole;
+ pagemap_walk.mm = mm;
+ pagemap_walk.private = &pm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ start_vaddr = svpfn << PAGE_SHIFT;
+ end_vaddr = TASK_SIZE_OF(task);
+
+ /* watch out for wraparound */
+ if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
+ if (ret == PM_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos += (char *)pm.out - buf;
+ if (!ret)
+ ret = (char *)pm.out - buf;
out_pages:
for (; pagecount; pagecount--) {
@@ -743,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f18..5d84e7121df 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b902430..78f613cb9c7 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f6..52312ec93ff 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c6..9ba495d5a29 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
- mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ /* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
- mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
-
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t retval;
-
- lock_kernel();
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(file->f_path.dentry->d_inode);
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
- unlock_kernel();
- return retval;
+ loff_t n;
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ n = generic_file_llseek_unlocked(file, offset, origin);
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return n;
}
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 57917932212..192269698a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode)
goto out;
reiserfs_update_inode_transaction(inode);
+ reiserfs_discard_prealloc(&th, inode);
+
err = reiserfs_delete_object(&th, inode);
/* Do quota update inside a transaction for journaled quotas. We must do that
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed424d708e6..1d40f2bd197 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7..2294783320c 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
return error;
}
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations smb_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = smb_remote_llseek,
.read = do_sync_read,
.aio_read = smb_file_aio_read,
.write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b30..399442179d8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(pages[page_nr]);
+ pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
new file mode 100644
index 00000000000..91ceeda7e5b
--- /dev/null
+++ b/fs/ubifs/Kconfig
@@ -0,0 +1,72 @@
+config UBIFS_FS
+ tristate "UBIFS file system support"
+ select CRC16
+ select CRC32
+ select CRYPTO if UBIFS_FS_ADVANCED_COMPR
+ select CRYPTO if UBIFS_FS_LZO
+ select CRYPTO if UBIFS_FS_ZLIB
+ select CRYPTO_LZO if UBIFS_FS_LZO
+ select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
+ depends on MTD_UBI
+ help
+ UBIFS is a file system for flash devices which works on top of UBI.
+
+config UBIFS_FS_XATTR
+ bool "Extended attributes support"
+ depends on UBIFS_FS
+ help
+ This option enables support of extended attributes.
+
+config UBIFS_FS_ADVANCED_COMPR
+ bool "Advanced compression options"
+ depends on UBIFS_FS
+ help
+ This option allows you to explicitly choose which compressors, if any,
+ are enabled in UBIFS. Removing compressors means inability to read
+ existing file systems.
+
+ If unsure, say 'N'.
+
+config UBIFS_FS_LZO
+ bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
+ depends on UBIFS_FS
+ default y
+ help
+ The LZO compressor is generally faster than zlib but compresses worse.
+ Say 'Y' if unsure.
+
+config UBIFS_FS_ZLIB
+ bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
+ depends on UBIFS_FS
+ default y
+ help
+ Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+# Debugging-related stuff
+config UBIFS_FS_DEBUG
+ bool "Enable debugging"
+ depends on UBIFS_FS
+ select DEBUG_FS
+ select KALLSYMS_ALL
+ help
+ This option enables UBIFS debugging.
+
+config UBIFS_FS_DEBUG_MSG_LVL
+ int "Default message level (0 = no extra messages, 3 = lots)"
+ depends on UBIFS_FS_DEBUG
+ default "0"
+ help
+ This controls the amount of debugging messages produced by UBIFS.
+ If reporting bugs, please try to have available a full dump of the
+ messages at level 1 while the misbehaviour was occurring. Level 2
+ may become necessary if level 1 messages were not enough to find the
+ bug. Generally Level 3 should be avoided.
+
+config UBIFS_FS_DEBUG_CHKS
+ bool "Enable extra checks"
+ depends on UBIFS_FS_DEBUG
+ help
+ If extra checks are enabled UBIFS will check the consistency of its
+ internal data structures during operation. However, UBIFS performance
+ is dramatically slower when this option is selected especially if the
+ file system is large.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
new file mode 100644
index 00000000000..80e93c35e49
--- /dev/null
+++ b/fs/ubifs/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_UBIFS_FS) += ubifs.o
+
+ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
+ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
+ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
+ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
+
+ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
+ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
new file mode 100644
index 00000000000..d81fb9ed2b8
--- /dev/null
+++ b/fs/ubifs/budget.c
@@ -0,0 +1,731 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the budgeting sub-system which is responsible for UBIFS
+ * space management.
+ *
+ * Factors such as compression, wasted space at the ends of LEBs, space in other
+ * journal heads, the effect of updates on the index, and so on, make it
+ * impossible to accurately predict the amount of space needed. Consequently
+ * approximations are used.
+ */
+
+#include "ubifs.h"
+#include <linux/writeback.h>
+#include <asm/div64.h>
+
+/*
+ * When pessimistic budget calculations say that there is not enough space,
+ * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
+ * or committing. The below constants define maximum number of times UBIFS
+ * repeats the operations.
+ */
+#define MAX_SHRINK_RETRIES 8
+#define MAX_GC_RETRIES 4
+#define MAX_CMT_RETRIES 2
+#define MAX_NOSPC_RETRIES 1
+
+/*
+ * The below constant defines the number of dirty pages which should be
+ * written back when trying to shrink the liability.
+ */
+#define NR_TO_WRITE 16
+
+/**
+ * struct retries_info - information about re-tries while making free space.
+ * @prev_liability: previous liability
+ * @shrink_cnt: how many times the liability was shrunk
+ * @shrink_retries: count of liability shrink re-tries (increased when
+ * liability does not shrink)
+ * @try_gc: GC should be tried first
+ * @gc_retries: how many times GC was run
+ * @cmt_retries: how many times commit has been done
+ * @nospc_retries: how many times GC returned %-ENOSPC
+ *
+ * Since we consider budgeting to be the fast-path, and this structure has to
+ * be allocated on stack and zeroed out, we make it smaller using bit-fields.
+ */
+struct retries_info {
+ long long prev_liability;
+ unsigned int shrink_cnt;
+ unsigned int shrink_retries:5;
+ unsigned int try_gc:1;
+ unsigned int gc_retries:4;
+ unsigned int cmt_retries:3;
+ unsigned int nospc_retries:1;
+};
+
+/**
+ * shrink_liability - write-back some dirty pages/inodes.
+ * @c: UBIFS file-system description object
+ * @nr_to_write: how many dirty pages to write-back
+ *
+ * This function shrinks UBIFS liability by means of writing back some amount
+ * of dirty inodes and their pages. Returns the amount of pages which were
+ * written back. The returned value does not include dirty inodes which were
+ * synchronized.
+ *
+ * Note, this function synchronizes even VFS inodes which are locked
+ * (@i_mutex) by the caller of the budgeting function, because write-back does
+ * not touch @i_mutex.
+ */
+static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+{
+ int nr_written;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .range_end = LLONG_MAX,
+ .nr_to_write = nr_to_write,
+ };
+
+ generic_sync_sb_inodes(c->vfs_sb, &wbc);
+ nr_written = nr_to_write - wbc.nr_to_write;
+
+ if (!nr_written) {
+ /*
+ * Re-try again but wait on pages/inodes which are being
+ * written-back concurrently (e.g., by pdflush).
+ */
+ memset(&wbc, 0, sizeof(struct writeback_control));
+ wbc.sync_mode = WB_SYNC_ALL;
+ wbc.range_end = LLONG_MAX;
+ wbc.nr_to_write = nr_to_write;
+ generic_sync_sb_inodes(c->vfs_sb, &wbc);
+ nr_written = nr_to_write - wbc.nr_to_write;
+ }
+
+ dbg_budg("%d pages were written back", nr_written);
+ return nr_written;
+}
+
+
+/**
+ * run_gc - run garbage collector.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs garbage collector to make some more free space. Returns
+ * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a
+ * negative error code in case of failure.
+ */
+static int run_gc(struct ubifs_info *c)
+{
+ int err, lnum;
+
+ /* Make some free space by garbage-collecting dirty space */
+ down_read(&c->commit_sem);
+ lnum = ubifs_garbage_collect(c, 1);
+ up_read(&c->commit_sem);
+ if (lnum < 0)
+ return lnum;
+
+ /* GC freed one LEB, return it to lprops */
+ dbg_budg("GC freed LEB %d", lnum);
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ return err;
+ return 0;
+}
+
+/**
+ * make_free_space - make more free space on the file-system.
+ * @c: UBIFS file-system description object
+ * @ri: information about previous invocations of this function
+ *
+ * This function is called when an operation cannot be budgeted because there
+ * is supposedly no free space. But in most cases there is some free space:
+ * o budgeting is pessimistic, so it always budgets more than is actually
+ * needed, so shrinking the liability is one way to make free space - the
+ * cached data will take less space than it was budgeted for;
+ * o GC may turn some dark space into free space (budgeting treats dark space
+ * as not available);
+ * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs.
+ *
+ * So this function tries to do the above. Returns %-EAGAIN if some free space
+ * was presumably made and the caller has to re-try budgeting the operation.
+ * Returns %-ENOSPC if it couldn't make more free space, and other negative error
+ * codes on failures.
+ */
+static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
+{
+ int err;
+
+ /*
+ * If we have some dirty pages and inodes (liability), try to write
+ * them back unless this was tried too many times without effect
+ * already.
+ */
+ if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
+ long long liability;
+
+ spin_lock(&c->space_lock);
+ liability = c->budg_idx_growth + c->budg_data_growth +
+ c->budg_dd_growth;
+ spin_unlock(&c->space_lock);
+
+ if (ri->prev_liability >= liability) {
+ /* Liability does not shrink, next time try GC then */
+ ri->shrink_retries += 1;
+ if (ri->gc_retries < MAX_GC_RETRIES)
+ ri->try_gc = 1;
+ dbg_budg("liability did not shrink: retries %d of %d",
+ ri->shrink_retries, MAX_SHRINK_RETRIES);
+ }
+
+ dbg_budg("force write-back (count %d)", ri->shrink_cnt);
+ shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
+
+ ri->prev_liability = liability;
+ ri->shrink_cnt += 1;
+ return -EAGAIN;
+ }
+
+ /*
+ * Try to run garbage collector unless it was already tried too many
+ * times.
+ */
+ if (ri->gc_retries < MAX_GC_RETRIES) {
+ ri->gc_retries += 1;
+ dbg_budg("run GC, retries %d of %d",
+ ri->gc_retries, MAX_GC_RETRIES);
+
+ ri->try_gc = 0;
+ err = run_gc(c);
+ if (!err)
+ return -EAGAIN;
+
+ if (err == -EAGAIN) {
+ dbg_budg("GC asked to commit");
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ return -EAGAIN;
+ }
+
+ if (err != -ENOSPC)
+ return err;
+
+ /*
+ * GC could not make any progress. If this is the first time,
+ * then it makes sense to try to commit, because it might make
+ * some dirty space.
+ */
+ dbg_budg("GC returned -ENOSPC, retries %d",
+ ri->nospc_retries);
+ if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
+ return err;
+ ri->nospc_retries += 1;
+ }
+
+ /* Neither GC nor write-back helped, try to commit */
+ if (ri->cmt_retries < MAX_CMT_RETRIES) {
+ ri->cmt_retries += 1;
+ dbg_budg("run commit, retries %d of %d",
+ ri->cmt_retries, MAX_CMT_RETRIES);
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ return -EAGAIN;
+ }
+ return -ENOSPC;
+}
+
+/**
+ * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns the number of eraseblocks which should
+ * be kept for index usage.
+ */
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
+{
+ int ret;
+ uint64_t idx_size;
+
+ idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+
+ /* And make sure we have twice the index size of space reserved */
+ idx_size <<= 1;
+
+ /*
+ * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
+ * pair, nor similarly the two variables for the new index size, so we
+ * have to do this costly 64-bit division on fast-path.
+ */
+ if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
+ ret = idx_size + 1;
+ else
+ ret = idx_size;
+ /*
+ * The index head is not available for the in-the-gaps method, so add an
+ * extra LEB to compensate.
+ */
+ ret += 1;
+ /*
+ * At present the index needs at least 2 LEBs: one for the index head
+ * and one for in-the-gaps method (which currently does not cater for
+ * the index head and so excludes it from consideration).
+ */
+ if (ret < 2)
+ ret = 2;
+ return ret;
+}
+
+/**
+ * ubifs_calc_available - calculate available FS space.
+ * @c: UBIFS file-system description object
+ * @min_idx_lebs: minimum number of LEBs reserved for the index
+ *
+ * This function calculates and returns amount of FS space available for use.
+ */
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
+{
+ int subtract_lebs;
+ long long available;
+
+ /*
+ * Force the amount available to the total size reported if the used
+ * space is zero.
+ */
+ if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
+ c->budg_data_growth + c->budg_dd_growth == 0) {
+ /* Do the same calculation as for c->block_cnt */
+ available = c->main_lebs - 2;
+ available *= c->leb_size - c->dark_wm;
+ return available;
+ }
+
+ available = c->main_bytes - c->lst.total_used;
+
+ /*
+ * Now 'available' contains theoretically available flash space
+ * assuming there is no index, so we have to subtract the space which
+ * is reserved for the index.
+ */
+ subtract_lebs = min_idx_lebs;
+
+ /* Take into account that GC reserves one LEB for its own needs */
+ subtract_lebs += 1;
+
+ /*
+ * The GC journal head LEB is not really accessible. And since
+ * different write types go to different heads, we may count only on
+ * one head's space.
+ */
+ subtract_lebs += c->jhead_cnt - 1;
+
+ /* We also reserve one LEB for deletions, which bypass budgeting */
+ subtract_lebs += 1;
+
+ available -= (long long)subtract_lebs * c->leb_size;
+
+ /* Subtract the dead space which is not available for use */
+ available -= c->lst.total_dead;
+
+ /*
+ * Subtract dark space, which might or might not be usable - it depends
+ * on the data which we have on the media and which will be written. If
+ * this is a lot of uncompressed or not-compressible data, the dark
+ * space cannot be used.
+ */
+ available -= c->lst.total_dark;
+
+ /*
+ * However, there is more dark space. The index may be bigger than
+ * @min_idx_lebs. Those extra LEBs are assumed to be available, but
+ * their dark space is not included in total_dark, so it is subtracted
+ * here.
+ */
+ if (c->lst.idx_lebs > min_idx_lebs) {
+ subtract_lebs = c->lst.idx_lebs - min_idx_lebs;
+ available -= subtract_lebs * c->dark_wm;
+ }
+
+ /* The calculations are rough and may end up with a negative number */
+ return available > 0 ? available : 0;
+}
+
+/**
+ * can_use_rp - check whether the user is allowed to use reserved pool.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS has a so-called "reserved pool" which is flash space reserved
+ * for the superuser and for users whose UID/GID is recorded in UBIFS superblock.
+ * This function checks whether the current user is allowed to use reserved pool.
+ * Returns %1 if the current user is allowed to use reserved pool and %0 otherwise.
+ */
+static int can_use_rp(struct ubifs_info *c)
+{
+ if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) ||
+ (c->rp_gid != 0 && in_group_p(c->rp_gid)))
+ return 1;
+ return 0;
+}
+
+/**
+ * do_budget_space - reserve flash space for index and data growth.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free eraseblocks for index growth
+ * and data.
+ *
+ * When budgeting index space, UBIFS reserves twice as many LEBs as the index
+ * would take if it was consolidated and written to the flash. This guarantees
+ * that the "in-the-gaps" commit method always succeeds and UBIFS will always
+ * be able to commit dirty index. So this function basically adds amount of
+ * budgeted index space to the size of the current index, multiplies this by 2,
+ * and makes sure this does not exceed the amount of free eraseblocks.
+ *
+ * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
+ * be large, because UBIFS does not do any index consolidation as long as
+ * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
+ * will contain a lot of dirt.
+ * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
+ * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ *
+ * This function returns zero in case of success, and %-ENOSPC in case of
+ * failure.
+ */
+static int do_budget_space(struct ubifs_info *c)
+{
+ long long outstanding, available;
+ int lebs, rsvd_idx_lebs, min_idx_lebs;
+
+ /* First budget index space */
+ min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ /* Now 'min_idx_lebs' contains number of LEBs to reserve */
+ if (min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+
+ /*
+ * The number of LEBs that are available to be used by the index is:
+ *
+ * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
+ * @c->lst.taken_empty_lebs
+ *
+ * @empty_lebs are available because they are empty. @freeable_cnt are
+ * available because they contain only free and dirty space and the
+ * index allocation always occurs after wbufs are synch'ed.
+ * @idx_gc_cnt are available because they are index LEBs that have been
+ * garbage collected (including trivial GC) and are awaiting the commit
+ * before they can be unmapped - note that the in-the-gaps method will
+ * grab these if it needs them. @taken_empty_lebs are empty_lebs that
+ * have already been allocated for some purpose (also includes those
+ * LEBs on the @idx_gc list).
+ *
+ * Note, @taken_empty_lebs may temporarily be higher by one because of
+ * the way we serialize LEB allocations and budgeting. See a comment in
+ * 'ubifs_find_free_space()'.
+ */
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ if (unlikely(rsvd_idx_lebs > lebs)) {
+ dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
+ "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+ rsvd_idx_lebs);
+ return -ENOSPC;
+ }
+
+ available = ubifs_calc_available(c, min_idx_lebs);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+
+ if (unlikely(available < outstanding)) {
+ dbg_budg("out of data space: available %lld, outstanding %lld",
+ available, outstanding);
+ return -ENOSPC;
+ }
+
+ if (available - outstanding <= c->rp_size && !can_use_rp(c))
+ return -ENOSPC;
+
+ c->min_idx_lebs = min_idx_lebs;
+ return 0;
+}
+
+/**
+ * calc_idx_growth - calculate approximate index growth from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ *
+ * For now we assume each new node adds one znode. This is a rather poor
+ * approximation, though.
+ */
+static int calc_idx_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int znodes;
+
+ znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) +
+ req->new_dent;
+ return znodes * c->max_idx_node_sz;
+}
+
+/**
+ * calc_data_growth - calculate approximate amount of new data from budgeting
+ * request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_data_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int data_growth;
+
+ data_growth = req->new_ino ? c->inode_budget : 0;
+ if (req->new_page)
+ data_growth += c->page_budget;
+ if (req->new_dent)
+ data_growth += c->dent_budget;
+ data_growth += req->new_ino_d;
+ return data_growth;
+}
+
+/**
+ * calc_dd_growth - calculate approximate amount of data which makes other data
+ * dirty from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_dd_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int dd_growth;
+
+ dd_growth = req->dirtied_page ? c->page_budget : 0;
+
+ if (req->dirtied_ino)
+ dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+ if (req->mod_dent)
+ dd_growth += c->dent_budget;
+ dd_growth += req->dirtied_ino_d;
+ return dd_growth;
+}
+
+/**
+ * ubifs_budget_space - ensure there is enough space to complete an operation.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function allocates budget for an operation. It uses pessimistic
+ * approximation of how much flash space the operation needs. The goal of this
+ * function is to make sure UBIFS always has flash space to flush all dirty
+ * pages, dirty inodes, and dirty znodes (liability). This function may force
+ * commit, garbage-collection or write-back. Returns zero in case of success,
+ * %-ENOSPC if there is no free space and other negative error codes in case of
+ * failures.
+ */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+ int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
+ int err, idx_growth, data_growth, dd_growth;
+ struct retries_info ri;
+
+ ubifs_assert(req->dirtied_ino <= 4);
+ ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+
+ data_growth = calc_data_growth(c, req);
+ dd_growth = calc_dd_growth(c, req);
+ if (!data_growth && !dd_growth)
+ return 0;
+ idx_growth = calc_idx_growth(c, req);
+ memset(&ri, 0, sizeof(struct retries_info));
+
+again:
+ spin_lock(&c->space_lock);
+ ubifs_assert(c->budg_idx_growth >= 0);
+ ubifs_assert(c->budg_data_growth >= 0);
+ ubifs_assert(c->budg_dd_growth >= 0);
+
+ if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+ dbg_budg("no space");
+ spin_unlock(&c->space_lock);
+ return -ENOSPC;
+ }
+
+ c->budg_idx_growth += idx_growth;
+ c->budg_data_growth += data_growth;
+ c->budg_dd_growth += dd_growth;
+
+ err = do_budget_space(c);
+ if (likely(!err)) {
+ req->idx_growth = idx_growth;
+ req->data_growth = data_growth;
+ req->dd_growth = dd_growth;
+ spin_unlock(&c->space_lock);
+ return 0;
+ }
+
+ /* Restore the old values */
+ c->budg_idx_growth -= idx_growth;
+ c->budg_data_growth -= data_growth;
+ c->budg_dd_growth -= dd_growth;
+ spin_unlock(&c->space_lock);
+
+ if (req->fast) {
+ dbg_budg("no space for fast budgeting");
+ return err;
+ }
+
+ err = make_free_space(c, &ri);
+ if (err == -EAGAIN) {
+ dbg_budg("try again");
+ cond_resched();
+ goto again;
+ } else if (err == -ENOSPC) {
+ dbg_budg("FS is full, -ENOSPC");
+ c->nospace = 1;
+ if (can_use_rp(c) || c->rp_size == 0)
+ c->nospace_rp = 1;
+ smp_wmb();
+ } else
+ ubifs_err("cannot budget space, error %d", err);
+ return err;
+}
+
+/**
+ * ubifs_release_budget - release budgeted free space.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
+ * since the index changes (which were budgeted for in @req->idx_growth) will
+ * only be written to the media on commit, this function moves the index budget
+ * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
+ * zeroed by the commit operation.
+ */
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+ ubifs_assert(req->dirtied_ino <= 4);
+ ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+ if (!req->recalculate) {
+ ubifs_assert(req->idx_growth >= 0);
+ ubifs_assert(req->data_growth >= 0);
+ ubifs_assert(req->dd_growth >= 0);
+ }
+
+ if (req->recalculate) {
+ req->data_growth = calc_data_growth(c, req);
+ req->dd_growth = calc_dd_growth(c, req);
+ req->idx_growth = calc_idx_growth(c, req);
+ }
+
+ if (!req->data_growth && !req->dd_growth)
+ return;
+
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+
+ spin_lock(&c->space_lock);
+ c->budg_idx_growth -= req->idx_growth;
+ c->budg_uncommitted_idx += req->idx_growth;
+ c->budg_data_growth -= req->data_growth;
+ c->budg_dd_growth -= req->dd_growth;
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ ubifs_assert(c->budg_idx_growth >= 0);
+ ubifs_assert(c->budg_data_growth >= 0);
+ ubifs_assert(c->min_idx_lebs < c->main_lebs);
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_convert_page_budget - convert budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This function converts budget which was allocated for a new page of data to
+ * the budget of changing an existing page of data. The latter is smaller than
+ * the former, so this function only does simple re-calculation and does not
+ * involve any write-back.
+ */
+void ubifs_convert_page_budget(struct ubifs_info *c)
+{
+ spin_lock(&c->space_lock);
+ /* Release the index growth reservation */
+ c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ /* Release the data growth reservation */
+ c->budg_data_growth -= c->page_budget;
+ /* Increase the dirty data growth reservation instead */
+ c->budg_dd_growth += c->page_budget;
+ /* And re-calculate the indexing space reservation */
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_release_dirty_inode_budget - release dirty inode budget.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to release the budget for
+ *
+ * This function releases budget corresponding to a dirty inode. It is usually
+ * called after the inode has been written to the media and marked as
+ * clean.
+ */
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+ struct ubifs_inode *ui)
+{
+ struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
+ .dirtied_ino_d = ui->data_len};
+
+ ubifs_release_budget(c, &req);
+}
+
+/**
+ * ubifs_budg_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns amount of free space on the file-system.
+ */
+long long ubifs_budg_get_free_space(struct ubifs_info *c)
+{
+ int min_idx_lebs, rsvd_idx_lebs;
+ long long available, outstanding, free;
+
+ /* Do exactly the same calculations as in 'do_budget_space()' */
+ spin_lock(&c->space_lock);
+ min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ if (min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+
+ if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
+ - c->lst.taken_empty_lebs) {
+ spin_unlock(&c->space_lock);
+ return 0;
+ }
+
+ available = ubifs_calc_available(c, min_idx_lebs);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+ c->min_idx_lebs = min_idx_lebs;
+ spin_unlock(&c->space_lock);
+
+ if (available > outstanding)
+ free = ubifs_reported_space(c, available - outstanding);
+ else
+ free = 0;
+ return free;
+}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
new file mode 100644
index 00000000000..3b516316c9b
--- /dev/null
+++ b/fs/ubifs/commit.c
@@ -0,0 +1,677 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions that manage the running of the commit process.
+ * Each affected module has its own functions to accomplish their part in the
+ * commit and those functions are called here.
+ *
+ * The commit is the process whereby all updates to the index and LEB properties
+ * are written out together and the journal becomes empty. This keeps the
+ * file system consistent - at all times the state can be recreated by reading
+ * the index and LEB properties and then replaying the journal.
+ *
+ * The commit is split into two parts named "commit start" and "commit end".
+ * During commit start, the commit process has exclusive access to the journal
+ * by holding the commit semaphore down for writing. As few I/O operations as
+ * possible are performed during commit start, instead the nodes that are to be
+ * written are merely identified. During commit end, the commit semaphore is no
+ * longer held and the journal is again in operation, allowing users to continue
+ * to use the file system while the bulk of the commit I/O is performed. The
+ * purpose of this two-step approach is to prevent the commit from causing any
+ * latency blips. Note that in any case, the commit does not prevent lookups
+ * (as permitted by the TNC mutex), or access to VFS data structures e.g. page
+ * cache.
+ */
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include "ubifs.h"
+
+/**
+ * do_commit - commit the journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function implements UBIFS commit. It has to be called with commit lock
+ * locked. Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * The commit semaphore ('c->commit_sem') is always released by this function:
+ * on the normal path right after the "start commit" steps, and at the 'out_up'
+ * label if the media is read-only or one of those steps fails. On success the
+ * commit state is set to %COMMIT_RESTING. On any failure the error is logged,
+ * the commit state is set to %COMMIT_BROKEN and 'ubifs_ro_mode()' is called
+ * with the error code. In both cases the waiters on 'c->cmt_wq' are woken up.
+ */
+static int do_commit(struct ubifs_info *c)
+{
+ int err, new_ltail_lnum, old_ltail_lnum, i;
+ struct ubifs_zbranch zroot;
+ struct ubifs_lp_stats lst;
+
+ dbg_cmt("start");
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_up;
+ }
+
+ /* Sync all write buffers (necessary for recovery) */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ goto out_up;
+ }
+
+ err = ubifs_gc_start_commit(c);
+ if (err)
+ goto out_up;
+ err = dbg_check_lprops(c);
+ if (err)
+ goto out_up;
+ err = ubifs_log_start_commit(c, &new_ltail_lnum);
+ if (err)
+ goto out_up;
+ err = ubifs_tnc_start_commit(c, &zroot);
+ if (err)
+ goto out_up;
+ err = ubifs_lpt_start_commit(c);
+ if (err)
+ goto out_up;
+ err = ubifs_orphan_start_commit(c);
+ if (err)
+ goto out_up;
+
+ ubifs_get_lp_stats(c, &lst); /* snapshot, stored in the master node below */
+
+ up_write(&c->commit_sem); /* "commit start" is over, "commit end" follows */
+
+ err = ubifs_tnc_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_lpt_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_orphan_end_commit(c);
+ if (err)
+ goto out;
+ old_ltail_lnum = c->ltail_lnum; /* needed by 'ubifs_log_post_commit()' */
+ err = ubifs_log_end_commit(c, new_ltail_lnum);
+ if (err)
+ goto out;
+ err = dbg_check_old_index(c, &zroot);
+ if (err)
+ goto out;
+
+ mutex_lock(&c->mst_mutex); /* fill in and write the master node */
+ c->mst_node->cmt_no = cpu_to_le64(++c->cmt_no);
+ c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
+ c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
+ c->mst_node->root_offs = cpu_to_le32(zroot.offs);
+ c->mst_node->root_len = cpu_to_le32(zroot.len);
+ c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
+ c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
+ c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
+ c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
+ c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
+ c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
+ c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs);
+ c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum);
+ c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs);
+ c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum);
+ c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs);
+ c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum);
+ c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs);
+ c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs);
+ c->mst_node->total_free = cpu_to_le64(lst.total_free);
+ c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
+ c->mst_node->total_used = cpu_to_le64(lst.total_used);
+ c->mst_node->total_dead = cpu_to_le64(lst.total_dead);
+ c->mst_node->total_dark = cpu_to_le64(lst.total_dark);
+ if (c->no_orphs)
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ else
+ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ err = ubifs_write_master(c);
+ mutex_unlock(&c->mst_mutex);
+ if (err)
+ goto out;
+
+ err = ubifs_log_post_commit(c, old_ltail_lnum);
+ if (err)
+ goto out;
+ err = ubifs_gc_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_lpt_post_commit(c);
+ if (err)
+ goto out;
+
+ spin_lock(&c->cs_lock);
+ c->cmt_state = COMMIT_RESTING;
+ wake_up(&c->cmt_wq);
+ dbg_cmt("commit end");
+ spin_unlock(&c->cs_lock);
+
+ return 0;
+
+out_up: /* failed while the commit semaphore was still held */
+ up_write(&c->commit_sem);
+out:
+ ubifs_err("commit failed, error %d", err);
+ spin_lock(&c->cs_lock);
+ c->cmt_state = COMMIT_BROKEN;
+ wake_up(&c->cmt_wq);
+ spin_unlock(&c->cs_lock);
+ ubifs_ro_mode(c, err);
+ return err;
+}
+
+/**
+ * run_bg_commit - run background commit if it is needed.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs background commit if it is needed. Returns zero in case
+ * of success and a negative error code in case of failure.
+ *
+ * Zero is also returned, without committing, if the commit state is neither
+ * %COMMIT_BACKGROUND nor %COMMIT_REQUIRED. The state is checked twice because
+ * 'c->cs_lock' has to be dropped before 'c->commit_sem' is taken, so the state
+ * may have changed in between. Once the commit is started, the commit
+ * semaphore is released by 'do_commit()'.
+ */
+static int run_bg_commit(struct ubifs_info *c)
+{
+ spin_lock(&c->cs_lock);
+ /*
+ * Run background commit only if background commit was requested or if
+ * commit is required.
+ */
+ if (c->cmt_state != COMMIT_BACKGROUND &&
+ c->cmt_state != COMMIT_REQUIRED)
+ goto out;
+ spin_unlock(&c->cs_lock);
+
+ down_write(&c->commit_sem);
+ spin_lock(&c->cs_lock); /* re-check the state under the lock */
+ if (c->cmt_state == COMMIT_REQUIRED)
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+ else if (c->cmt_state == COMMIT_BACKGROUND)
+ c->cmt_state = COMMIT_RUNNING_BACKGROUND;
+ else
+ goto out_cmt_unlock;
+ spin_unlock(&c->cs_lock);
+
+ return do_commit(c);
+
+out_cmt_unlock:
+ up_write(&c->commit_sem);
+out:
+ spin_unlock(&c->cs_lock);
+ return 0;
+}
+
+/**
+ * ubifs_bg_thread - UBIFS background thread function.
+ * @info: points to the file-system description object
+ *
+ * This function implements various file-system background activities:
+ * o when a write-buffer timer expires it synchronizes the appropriate
+ * write-buffer;
+ * o when the journal is about to be full, it starts in-advance commit.
+ *
+ * Note, other stuff like background garbage collection may be added here in
+ * future.
+ *
+ * The thread loops until 'kthread_should_stop()' is true and sleeps whenever
+ * 'c->need_bgt' is not set. If the write-buffer synchronization fails, the
+ * file-system is switched to read-only mode via 'ubifs_ro_mode()'. The return
+ * code of 'run_bg_commit()' is not checked here. This function always returns
+ * zero.
+ */
+int ubifs_bg_thread(void *info)
+{
+ int err;
+ struct ubifs_info *c = info;
+
+ ubifs_msg("background thread \"%s\" started, PID %d",
+ c->bgt_name, current->pid);
+ set_freezable();
+
+ while (1) {
+ if (kthread_should_stop())
+ break;
+
+ if (try_to_freeze())
+ continue;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Check if there is something to do */
+ if (!c->need_bgt) {
+ /*
+ * Re-check the stop request now that the task state is
+ * set: otherwise nothing would prevent us from going to
+ * sleep, never being woken up, and blocking forever the
+ * task which may be waiting in 'kthread_stop()'.
+ */
+ if (kthread_should_stop())
+ break;
+ schedule();
+ continue;
+ } else
+ __set_current_state(TASK_RUNNING);
+
+ c->need_bgt = 0; /* cleared before doing the work */
+ err = ubifs_bg_wbufs_sync(c);
+ if (err)
+ ubifs_ro_mode(c, err);
+
+ run_bg_commit(c);
+ cond_resched();
+ }
+
+ dbg_msg("background thread \"%s\" stops", c->bgt_name);
+ return 0;
+}
+
+/**
+ * ubifs_commit_required - set commit state to "required".
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if a commit is required but cannot be done from the
+ * calling function, so it is just flagged instead: a resting or
+ * background-requested commit becomes %COMMIT_REQUIRED and a running
+ * background commit becomes %COMMIT_RUNNING_REQUIRED. Every other state is
+ * left untouched.
+ */
+void ubifs_commit_required(struct ubifs_info *c)
+{
+	int new_state;
+
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_RESTING ||
+	    c->cmt_state == COMMIT_BACKGROUND)
+		new_state = COMMIT_REQUIRED;
+	else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+		new_state = COMMIT_RUNNING_REQUIRED;
+	else
+		/* Already required, or broken - nothing to flag */
+		goto out_unlock;
+
+	dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+		dbg_cstate(new_state));
+	c->cmt_state = new_state;
+
+out_unlock:
+	spin_unlock(&c->cs_lock);
+}
+
+/**
+ * ubifs_request_bg_commit - notify the background thread to do a commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if the journal is full enough to make a commit
+ * worthwhile. If the commit state is %COMMIT_RESTING it is changed to
+ * %COMMIT_BACKGROUND and the background thread is kicked; in any other state
+ * nothing is done.
+ */
+void ubifs_request_bg_commit(struct ubifs_info *c)
+{
+	int kick = 0;
+
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state == COMMIT_RESTING) {
+		dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+			dbg_cstate(COMMIT_BACKGROUND));
+		c->cmt_state = COMMIT_BACKGROUND;
+		kick = 1;
+	}
+	spin_unlock(&c->cs_lock);
+
+	/* The background thread is woken up with 'c->cs_lock' released */
+	if (kick)
+		ubifs_wake_up_bgt(c);
+}
+
+/**
+ * wait_for_commit - wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function sleeps until the commit operation is no longer running, i.e.
+ * until the commit state is neither %COMMIT_RUNNING_BACKGROUND nor
+ * %COMMIT_RUNNING_REQUIRED. It always returns zero, so the caller cannot tell
+ * from the return code whether the commit it waited for succeeded.
+ */
+static int wait_for_commit(struct ubifs_info *c)
+{
+ dbg_cmt("pid %d goes sleep", current->pid);
+
+ /*
+ * The following sleeps if the condition is false, and will be woken
+ * when the commit ends. It is possible, although very unlikely, that we
+ * will wake up and see the subsequent commit running, rather than the
+ * one we were waiting for, and go back to sleep. However, we will be
+ * woken again, so there is no danger of sleeping forever.
+ */
+ wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
+ c->cmt_state != COMMIT_RUNNING_REQUIRED);
+ dbg_cmt("commit finished, pid %d woke up", current->pid);
+ return 0;
+}
+
+/**
+ * ubifs_run_commit - run or wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs commit and returns zero in case of success and a negative
+ * error code in case of failure.
+ *
+ * If the commit state is %COMMIT_BROKEN, %-EINVAL is returned. If a commit is
+ * already running, it is marked as "required" and this function just waits for
+ * it to finish, returning whatever 'wait_for_commit()' returns. Otherwise the
+ * commit semaphore is taken, the state is re-checked and 'do_commit()' is
+ * called, which releases the commit semaphore.
+ */
+int ubifs_run_commit(struct ubifs_info *c)
+{
+ int err = 0;
+
+ spin_lock(&c->cs_lock);
+ if (c->cmt_state == COMMIT_BROKEN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+ /*
+ * We set the commit state to 'running required' to indicate
+ * that we want it to complete as quickly as possible.
+ */
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+ if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ spin_unlock(&c->cs_lock);
+ return wait_for_commit(c);
+ }
+ spin_unlock(&c->cs_lock);
+
+ /* Ok, the commit is indeed needed */
+
+ down_write(&c->commit_sem);
+ spin_lock(&c->cs_lock);
+ /*
+ * Since we unlocked 'c->cs_lock', the state may have changed, so
+ * re-check it.
+ */
+ if (c->cmt_state == COMMIT_BROKEN) {
+ err = -EINVAL;
+ goto out_cmt_unlock;
+ }
+
+ if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+ if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ up_write(&c->commit_sem);
+ spin_unlock(&c->cs_lock);
+ return wait_for_commit(c);
+ }
+ c->cmt_state = COMMIT_RUNNING_REQUIRED; /* we are the ones committing */
+ spin_unlock(&c->cs_lock);
+
+ err = do_commit(c);
+ return err;
+
+out_cmt_unlock:
+ up_write(&c->commit_sem);
+out:
+ spin_unlock(&c->cs_lock);
+ return err;
+}
+
+/**
+ * ubifs_gc_should_commit - determine if it is time for GC to run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by garbage collection to determine if commit should
+ * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal
+ * is full enough to start commit, the state is upgraded to @COMMIT_REQUIRED.
+ * It is not absolutely necessary to commit yet, but it feels like this should
+ * be better than to keep doing GC. This function returns %1 if the commit
+ * state is @COMMIT_REQUIRED on exit, i.e. GC has to initiate commit, and %0
+ * if not.
+ */
+int ubifs_gc_should_commit(struct ubifs_info *c)
+{
+	int ret;
+
+	spin_lock(&c->cs_lock);
+	if (c->cmt_state != COMMIT_BACKGROUND)
+		dbg_cmt("commit not requested");
+	else {
+		dbg_cmt("commit required now");
+		c->cmt_state = COMMIT_REQUIRED;
+	}
+	ret = (c->cmt_state == COMMIT_REQUIRED) ? 1 : 0;
+	spin_unlock(&c->cs_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * struct idx_node - hold index nodes during index tree traversal.
+ * @list: list
+ * @iip: index in parent (slot number of this indexing node in the parent
+ * indexing node)
+ * @upper_key: all keys in this indexing node have to be less or equivalent to
+ * this key
+ * @idx: index node (8-byte aligned because all node structures must be 8-byte
+ * aligned)
+ *
+ * Note, @idx is the last member: 'dbg_check_old_index()' allocates this
+ * structure with extra room after it, sized as 'sizeof(struct idx_node)' plus
+ * the size of an index node with 'c->fanout' branches minus
+ * %UBIFS_IDX_NODE_SZ, and reads the whole on-flash index node into @idx.
+ */
+struct idx_node {
+ struct list_head list;
+ int iip;
+ union ubifs_key upper_key;
+ struct ubifs_idx_node idx __attribute__((aligned(8)));
+};
+
+/**
+ * dbg_old_index_check_init - get information for the next old index check.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the index
+ *
+ * This function remembers @zroot in 'c->old_zroot', reads the index node it
+ * points to and records that node's level and sequence number. This is the
+ * information needed for the next old index check, i.e.
+ * 'dbg_check_old_index()'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	struct ubifs_idx_node *idx;
+	int err;
+
+	c->old_zroot = *zroot;
+
+	idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+	if (!idx)
+		return -ENOMEM;
+
+	err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, c->old_zroot.len,
+			      c->old_zroot.lnum, c->old_zroot.offs);
+	if (!err) {
+		c->old_zroot_level = le16_to_cpu(idx->level);
+		c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+	}
+
+	/* The node was only needed to extract the two values above */
+	kfree(idx);
+	return err;
+}
+
+/**
+ * dbg_check_old_index - check the old copy of the index.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the new index
+ *
+ * In order to be able to recover from an unclean unmount, a complete copy of
+ * the index must exist on flash. This is the "old" index. The commit process
+ * must write the "new" index to flash without overwriting or destroying any
+ * part of the old index. This function is run at commit end in order to check
+ * that the old index does indeed exist completely intact.
+ *
+ * The traversal is skipped if %UBIFS_CHK_OLD_IDX is not set in
+ * 'ubifs_chk_flags'. In either case a successful pass ends by recording @zroot
+ * with 'dbg_old_index_check_init()' for the next check.
+ *
+ * A failed consistency check is identified by a positive number in the
+ * "failed, error %d" message and is returned to the caller as %-EINVAL:
+ * 1 - child count is out of range;
+ * 2 - level of the root differs from the recorded one;
+ * 3 - sqnum of the root differs from the recorded one, or the level of a
+ *     node is not one less than the level of its parent;
+ * 4 - sqnum of a node is not less than the sqnum of its parent;
+ * 5 - the first key of a node is below the lower bound given by the parent;
+ * 6 - the last key of a node is above the upper bound given by the parent;
+ * 7 - the last key of a node equals the upper bound but is not a hash key.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+	int first = 1, iip;
+	union ubifs_key lower_key, upper_key, l_key, u_key;
+	unsigned long long uninitialized_var(last_sqnum);
+	struct ubifs_idx_node *idx;
+	struct list_head list;
+	struct idx_node *i;
+	size_t sz;
+
+	/*
+	 * The list has to be initialized before the first jump to 'out': if
+	 * 'dbg_old_index_check_init()' fails there, 'out_free' walks the list.
+	 */
+	INIT_LIST_HEAD(&list);
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
+		goto out;
+
+	sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
+	     UBIFS_IDX_NODE_SZ;
+
+	/* Start at the old zroot */
+	lnum = c->old_zroot.lnum;
+	offs = c->old_zroot.offs;
+	len = c->old_zroot.len;
+	iip = 0;
+
+	/*
+	 * Traverse the index tree preorder depth-first i.e. do a node and then
+	 * its subtrees from left to right.
+	 */
+	while (1) {
+		struct ubifs_branch *br;
+
+		/* Get the next index node */
+		i = kmalloc(sz, GFP_NOFS);
+		if (!i) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		i->iip = iip;
+		/* Keep the index nodes on our path in a linked list */
+		list_add_tail(&i->list, &list);
+		/* Read the index node */
+		idx = &i->idx;
+		err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+		if (err)
+			goto out_free;
+		/* Validate index node */
+		child_cnt = le16_to_cpu(idx->child_cnt);
+		if (child_cnt < 1 || child_cnt > c->fanout) {
+			err = 1;
+			goto out_dump;
+		}
+		if (first) {
+			first = 0;
+			/* Check root level and sqnum */
+			if (le16_to_cpu(idx->level) != c->old_zroot_level) {
+				err = 2;
+				goto out_dump;
+			}
+			if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
+				err = 3;
+				goto out_dump;
+			}
+			/* Set last values as though root had a parent */
+			last_level = le16_to_cpu(idx->level) + 1;
+			last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
+			key_read(c, ubifs_idx_key(c, idx), &lower_key);
+			highest_ino_key(c, &upper_key, INUM_WATERMARK);
+		}
+		key_copy(c, &upper_key, &i->upper_key);
+		if (le16_to_cpu(idx->level) != last_level - 1) {
+			err = 3;
+			goto out_dump;
+		}
+		/*
+		 * The index is always written bottom up hence a child's sqnum
+		 * is always less than the parents.
+		 */
+		if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
+			err = 4;
+			goto out_dump;
+		}
+		/* Check key range */
+		key_read(c, ubifs_idx_key(c, idx), &l_key);
+		br = ubifs_idx_branch(c, idx, child_cnt - 1);
+		key_read(c, &br->key, &u_key);
+		if (keys_cmp(c, &lower_key, &l_key) > 0) {
+			err = 5;
+			goto out_dump;
+		}
+		if (keys_cmp(c, &upper_key, &u_key) < 0) {
+			err = 6;
+			goto out_dump;
+		}
+		if (keys_cmp(c, &upper_key, &u_key) == 0)
+			if (!is_hash_key(c, &u_key)) {
+				err = 7;
+				goto out_dump;
+			}
+		/* Go to next index node */
+		if (le16_to_cpu(idx->level) == 0) {
+			/* At the bottom, so go up until can go right */
+			while (1) {
+				/* Drop the bottom of the list */
+				list_del(&i->list);
+				kfree(i);
+				/* No more list means we are done */
+				if (list_empty(&list))
+					goto out;
+				/* Look at the new bottom */
+				i = list_entry(list.prev, struct idx_node,
+					       list);
+				idx = &i->idx;
+				/* Can we go right */
+				if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+					iip = iip + 1;
+					break;
+				} else
+					/* Nope, so go up again */
+					iip = i->iip;
+			}
+		} else
+			/* Go down left */
+			iip = 0;
+		/*
+		 * We have the parent in 'idx' and now we set up for reading the
+		 * child pointed to by slot 'iip'.
+		 */
+		last_level = le16_to_cpu(idx->level);
+		last_sqnum = le64_to_cpu(idx->ch.sqnum);
+		br = ubifs_idx_branch(c, idx, iip);
+		lnum = le32_to_cpu(br->lnum);
+		offs = le32_to_cpu(br->offs);
+		len = le32_to_cpu(br->len);
+		key_read(c, &br->key, &lower_key);
+		if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+			br = ubifs_idx_branch(c, idx, iip + 1);
+			key_read(c, &br->key, &upper_key);
+		} else
+			key_copy(c, &i->upper_key, &upper_key);
+	}
+out:
+	/* Remember the new index root for the next check */
+	err = dbg_old_index_check_init(c, zroot);
+	if (err)
+		goto out_free;
+
+	return 0;
+
+out_dump:
+	dbg_err("dumping index node (iip=%d)", i->iip);
+	dbg_dump_node(c, idx);
+	list_del(&i->list);
+	kfree(i);
+	if (!list_empty(&list)) {
+		i = list_entry(list.prev, struct idx_node, list);
+		dbg_err("dumping parent index node");
+		dbg_dump_node(c, &i->idx);
+	}
+out_free:
+	/* Free whatever is left of the path (nothing if the check was off) */
+	while (!list_empty(&list)) {
+		i = list_entry(list.next, struct idx_node, list);
+		list_del(&i->list);
+		kfree(i);
+	}
+	ubifs_err("failed, error %d", err);
+	/* Positive values are internal check identifiers, not errno codes */
+	if (err > 0)
+		err = -EINVAL;
+	return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
new file mode 100644
index 00000000000..5bb51dac3c1
--- /dev/null
+++ b/fs/ubifs/compress.c
@@ -0,0 +1,253 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ * Zoltan Sogor
+ */
+
+/*
+ * This file provides a single place to access compression and
+ * decompression.
+ */
+
+#include <linux/crypto.h>
+#include "ubifs.h"
+
+/*
+ * Fake description object for the "none" compressor. Its @capi_name is an
+ * empty (but non-NULL) string, so the "is not compiled in" check in
+ * 'ubifs_decompress()' does not reject it.
+ */
+static struct ubifs_compressor none_compr = {
+ .compr_type = UBIFS_COMPR_NONE,
+ .name = "no compression",
+ .capi_name = "",
+};
+
+#ifdef CONFIG_UBIFS_FS_LZO
+static DEFINE_MUTEX(lzo_mutex);
+
+static struct ubifs_compressor lzo_compr = {
+ .compr_type = UBIFS_COMPR_LZO,
+ .comp_mutex = &lzo_mutex,
+ .name = "LZO",
+ .capi_name = "lzo",
+};
+#else /* no @capi_name: 'compr_init()' allocates nothing, decompression fails */
+static struct ubifs_compressor lzo_compr = {
+ .compr_type = UBIFS_COMPR_LZO,
+ .name = "LZO",
+};
+#endif /* CONFIG_UBIFS_FS_LZO */
+
+#ifdef CONFIG_UBIFS_FS_ZLIB
+static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(inflate_mutex);
+
+static struct ubifs_compressor zlib_compr = {
+ .compr_type = UBIFS_COMPR_ZLIB,
+ .comp_mutex = &deflate_mutex,
+ .decomp_mutex = &inflate_mutex,
+ .name = "zlib",
+ .capi_name = "deflate",
+};
+#else /* no @capi_name: 'compr_init()' allocates nothing, decompression fails */
+static struct ubifs_compressor zlib_compr = {
+ .compr_type = UBIFS_COMPR_ZLIB,
+ .name = "zlib",
+};
+#endif /* CONFIG_UBIFS_FS_ZLIB */
+
+/*
+ * All UBIFS compressors, indexed by compression type. The LZO and zlib
+ * entries are filled in by 'compr_init()' and the "none" entry by
+ * 'ubifs_compressors_init()'.
+ */
+struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/**
+ * ubifs_compress - compress data.
+ * @in_buf: data to compress
+ * @in_len: length of the data to compress
+ * @out_buf: output buffer where compressed data should be stored
+ * @out_len: output buffer length is returned here
+ * @compr_type: type of compression to use on enter, actually used compression
+ * type on exit
+ *
+ * This function compresses input buffer @in_buf of length @in_len and stores
+ * the result in the output buffer @out_buf and the resulting length in
+ * @out_len. If the input buffer does not compress, it is just copied to the
+ * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE, if @in_len
+ * is less than %UBIFS_MIN_COMPR_LEN, or if compression error occurred.
+ *
+ * Note, if the input buffer was not compressed, it is copied to the output
+ * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
+ *
+ * This function does not return a value: compression errors are reported with
+ * a warning and the data is stored uncompressed instead. Note, *@compr_type is
+ * used to index 'ubifs_compressors[]' without a range check, so the caller has
+ * to pass a valid compression type.
+ *
+ * NOTE(review): 'crypto_comp_compress()' presumably expects *@out_len to hold
+ * the size of @out_buf on entry - confirm against the callers.
+ */
+void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
+ int *compr_type)
+{
+ int err;
+ struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
+
+ if (*compr_type == UBIFS_COMPR_NONE)
+ goto no_compr;
+
+ /* If the input data is small, do not even try to compress it */
+ if (in_len < UBIFS_MIN_COMPR_LEN)
+ goto no_compr;
+
+ if (compr->comp_mutex)
+ mutex_lock(compr->comp_mutex);
+ err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
+ out_len);
+ if (compr->comp_mutex)
+ mutex_unlock(compr->comp_mutex);
+ if (unlikely(err)) {
+ ubifs_warn("cannot compress %d bytes, compressor %s, "
+ "error %d, leave data uncompressed",
+ in_len, compr->name, err);
+ goto no_compr;
+ }
+
+ /*
+ * Presently, we just require that compression results in less data,
+ * rather than any defined minimum compression ratio or amount. The
+ * lengths are compared after rounding both up to a multiple of 8.
+ */
+ if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
+ goto no_compr;
+
+ return;
+
+no_compr:
+ memcpy(out_buf, in_buf, in_len);
+ *out_len = in_len;
+ *compr_type = UBIFS_COMPR_NONE;
+}
+
+/**
+ * ubifs_decompress - decompress data.
+ * @in_buf: data to decompress
+ * @in_len: length of the data to decompress
+ * @out_buf: output buffer where decompressed data should be stored
+ * @out_len: output length is returned here
+ * @compr_type: type of compression
+ *
+ * This function decompresses data from buffer @in_buf into buffer @out_buf.
+ * The length of the uncompressed data is returned in @out_len. If @compr_type
+ * is %UBIFS_COMPR_NONE, the data is just copied. This function returns %0 on
+ * success or a negative error code on failure: %-EINVAL if @compr_type is out
+ * of range or the compressor is not compiled in, otherwise the error returned
+ * by 'crypto_comp_decompress()'.
+ */
+int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
+ int *out_len, int compr_type)
+{
+ int err;
+ struct ubifs_compressor *compr;
+
+ if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
+ ubifs_err("invalid compression type %d", compr_type);
+ return -EINVAL;
+ }
+
+ compr = ubifs_compressors[compr_type];
+
+ if (unlikely(!compr->capi_name)) { /* descriptor without a crypto API name */
+ ubifs_err("%s compression is not compiled in", compr->name);
+ return -EINVAL;
+ }
+
+ if (compr_type == UBIFS_COMPR_NONE) {
+ memcpy(out_buf, in_buf, in_len);
+ *out_len = in_len;
+ return 0;
+ }
+
+ if (compr->decomp_mutex)
+ mutex_lock(compr->decomp_mutex);
+ err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
+ out_len);
+ if (compr->decomp_mutex)
+ mutex_unlock(compr->decomp_mutex);
+ if (err)
+ ubifs_err("cannot decompress %d bytes, compressor %s, "
+ "error %d", in_len, compr->name, err);
+
+ return err;
+}
+
+/**
+ * compr_init - initialize a compressor.
+ * @compr: compressor description object
+ *
+ * This function allocates the crypto API transform for @compr, unless the
+ * description object has no crypto API name, and then registers @compr in the
+ * 'ubifs_compressors[]' array. Returns zero in case of success or a negative
+ * error code in case of failure, in which case @compr is not registered.
+ */
+static int __init compr_init(struct ubifs_compressor *compr)
+{
+	if (!compr->capi_name)
+		goto register_compr;
+
+	compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
+	if (IS_ERR(compr->cc)) {
+		long err = PTR_ERR(compr->cc);
+
+		ubifs_err("cannot initialize compressor %s, error %ld",
+			  compr->name, err);
+		return err;
+	}
+
+register_compr:
+	ubifs_compressors[compr->compr_type] = compr;
+	return 0;
+}
+
+/**
+ * compr_exit - de-initialize a compressor.
+ * @compr: compressor description object
+ *
+ * This function frees the crypto API transform of @compr. Description objects
+ * which have no crypto API name are left alone.
+ */
+static void compr_exit(struct ubifs_compressor *compr)
+{
+	if (!compr->capi_name)
+		return;
+
+	crypto_free_comp(compr->cc);
+}
+
+/**
+ * ubifs_compressors_init - initialize UBIFS compressors.
+ *
+ * This function initializes the LZO and zlib compressors and registers the
+ * fake "none" compressor. If zlib fails to initialize, the already initialized
+ * LZO compressor is released again. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int __init ubifs_compressors_init(void)
+{
+	int err = compr_init(&lzo_compr);
+
+	if (err)
+		return err;
+
+	err = compr_init(&zlib_compr);
+	if (err) {
+		/* Undo the LZO initialization done above */
+		compr_exit(&lzo_compr);
+		return err;
+	}
+
+	ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
+	return 0;
+}
+
+/**
+ * ubifs_compressors_exit - de-initialize UBIFS compressors.
+ *
+ * This function releases the LZO and zlib compressors which were set up by
+ * 'ubifs_compressors_init()'. The fake "none" compressor is not passed to
+ * 'compr_exit()'; 'compr_init()' is never called for it either.
+ */
+void __exit ubifs_compressors_exit(void)
+{
+ compr_exit(&lzo_compr);
+ compr_exit(&zlib_compr);
+}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
new file mode 100644
index 00000000000..4e3aaeba4ec
--- /dev/null
+++ b/fs/ubifs/debug.c
@@ -0,0 +1,2289 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements most of the debugging stuff which is compiled in only
+ * when it is enabled. But some debugging check functions are implemented in
+ * corresponding subsystem, just because they are closely related and utilize
+ * various local functions of those subsystems.
+ */
+
+#define UBIFS_DBG_PRESERVE_UBI
+
+#include "ubifs.h"
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+ /*
+ * Lock used by the dump functions of this file. 'dbg_key_str0()' and
+ * 'dbg_key_str1()' require it to be held because they format keys into the
+ * shared static buffers declared below.
+ */
+ DEFINE_SPINLOCK(dbg_lock);
+
+ /* Output buffers of 'dbg_key_str0()' and 'dbg_key_str1()' respectively */
+ static char dbg_key_buf0[128];
+ static char dbg_key_buf1[128];
+
+ /*
+ * Debugging flag words. They are exported below as the 'debug_msgs',
+ * 'debug_chks' and 'debug_tsts' module parameters; 'ubifs_chk_flags' is what
+ * the 'dbg_check_*()' functions of this file test before doing any work.
+ */
+ unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
+ unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
+ unsigned int ubifs_tst_flags;
+
+ /* The parameters are readable by everybody and writable by the owner */
+ module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
+ module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
+ module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
+
+ MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
+ MODULE_PARM_DESC(debug_chks, "Debug check flags");
+ MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
+
+ /* Return a printable name for key format @fmt */
+ static const char *get_key_fmt(int fmt)
+ {
+ 	if (fmt == UBIFS_SIMPLE_KEY_FMT)
+ 		return "simple";
+
+ 	return "unknown/invalid format";
+ }
+
+ /* Return a printable name for key hash function @hash */
+ static const char *get_key_hash(int hash)
+ {
+ 	if (hash == UBIFS_KEY_HASH_R5)
+ 		return "R5";
+ 	if (hash == UBIFS_KEY_HASH_TEST)
+ 		return "test";
+
+ 	return "unknown/invalid name hash";
+ }
+
+ /* Return a printable name for key type @type */
+ static const char *get_key_type(int type)
+ {
+ 	const char *name;
+
+ 	switch (type) {
+ 	case UBIFS_INO_KEY:
+ 		name = "inode";
+ 		break;
+ 	case UBIFS_DENT_KEY:
+ 		name = "direntry";
+ 		break;
+ 	case UBIFS_XENT_KEY:
+ 		name = "xentry";
+ 		break;
+ 	case UBIFS_DATA_KEY:
+ 		name = "data";
+ 		break;
+ 	case UBIFS_TRUN_KEY:
+ 		name = "truncate";
+ 		break;
+ 	default:
+ 		name = "unknown/invalid key";
+ 		break;
+ 	}
+
+ 	return name;
+ }
+
+ /*
+  * Format @key as human-readable text into @buffer. The buffer size is not
+  * passed in, so the caller has to provide one which is large enough for the
+  * longest message below.
+  */
+ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
+ 			char *buffer)
+ {
+ 	int type = key_type(c, key);
+
+ 	if (c->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
+ 		sprintf(buffer, "bad key format %d", c->key_fmt);
+ 		return;
+ 	}
+
+ 	switch (type) {
+ 	case UBIFS_INO_KEY:
+ 	case UBIFS_TRUN_KEY:
+ 		/* Inode and truncation keys carry only the inode number */
+ 		sprintf(buffer, "(%lu, %s)", key_inum(c, key),
+ 			get_key_type(type));
+ 		break;
+ 	case UBIFS_DENT_KEY:
+ 	case UBIFS_XENT_KEY:
+ 		sprintf(buffer, "(%lu, %s, %#08x)", key_inum(c, key),
+ 			get_key_type(type), key_hash(c, key));
+ 		break;
+ 	case UBIFS_DATA_KEY:
+ 		sprintf(buffer, "(%lu, %s, %u)", key_inum(c, key),
+ 			get_key_type(type), key_block(c, key));
+ 		break;
+ 	default:
+ 		sprintf(buffer, "(bad key type: %#08x, %#08x)",
+ 			key->u32[0], key->u32[1]);
+ 	}
+ }
+
+ /*
+  * Format @key into the first static key buffer and return that buffer. The
+  * buffer is shared, so the caller must hold 'dbg_lock' while using the result.
+  */
+ const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
+ {
+ 	char *out = dbg_key_buf0;
+
+ 	sprintf_key(c, key, out);
+ 	return out;
+ }
+
+ /*
+  * Format @key into the second static key buffer and return that buffer. The
+  * buffer is shared, so the caller must hold 'dbg_lock' while using the result.
+  */
+ const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
+ {
+ 	char *out = dbg_key_buf1;
+
+ 	sprintf_key(c, key, out);
+ 	return out;
+ }
+
+ /* Return a printable name for on-flash node type @type */
+ const char *dbg_ntype(int type)
+ {
+ 	const char *name;
+
+ 	switch (type) {
+ 	case UBIFS_PAD_NODE:
+ 		name = "padding node";
+ 		break;
+ 	case UBIFS_SB_NODE:
+ 		name = "superblock node";
+ 		break;
+ 	case UBIFS_MST_NODE:
+ 		name = "master node";
+ 		break;
+ 	case UBIFS_REF_NODE:
+ 		name = "reference node";
+ 		break;
+ 	case UBIFS_INO_NODE:
+ 		name = "inode node";
+ 		break;
+ 	case UBIFS_DENT_NODE:
+ 		name = "direntry node";
+ 		break;
+ 	case UBIFS_XENT_NODE:
+ 		name = "xentry node";
+ 		break;
+ 	case UBIFS_DATA_NODE:
+ 		name = "data node";
+ 		break;
+ 	case UBIFS_TRUN_NODE:
+ 		name = "truncate node";
+ 		break;
+ 	case UBIFS_IDX_NODE:
+ 		name = "indexing node";
+ 		break;
+ 	case UBIFS_CS_NODE:
+ 		name = "commit start node";
+ 		break;
+ 	case UBIFS_ORPH_NODE:
+ 		name = "orphan node";
+ 		break;
+ 	default:
+ 		name = "unknown node";
+ 		break;
+ 	}
+
+ 	return name;
+ }
+
+ /* Return a printable name for node group type @type */
+ static const char *dbg_gtype(int type)
+ {
+ 	if (type == UBIFS_NO_NODE_GROUP)
+ 		return "no node group";
+ 	if (type == UBIFS_IN_NODE_GROUP)
+ 		return "in node group";
+ 	if (type == UBIFS_LAST_OF_NODE_GROUP)
+ 		return "last of node group";
+
+ 	return "unknown";
+ }
+
+ /* Return a printable description of commit state @cmt_state */
+ const char *dbg_cstate(int cmt_state)
+ {
+ 	const char *descr;
+
+ 	switch (cmt_state) {
+ 	case COMMIT_RESTING:
+ 		descr = "commit resting";
+ 		break;
+ 	case COMMIT_BACKGROUND:
+ 		descr = "background commit requested";
+ 		break;
+ 	case COMMIT_REQUIRED:
+ 		descr = "commit required";
+ 		break;
+ 	case COMMIT_RUNNING_BACKGROUND:
+ 		descr = "BACKGROUND commit running";
+ 		break;
+ 	case COMMIT_RUNNING_REQUIRED:
+ 		descr = "commit running and required";
+ 		break;
+ 	case COMMIT_BROKEN:
+ 		descr = "broken commit";
+ 		break;
+ 	default:
+ 		descr = "unknown commit state";
+ 		break;
+ 	}
+
+ 	return descr;
+ }
+
+ /* Print the fields of common node header @ch, one field per line */
+ static void dump_ch(const struct ubifs_ch *ch)
+ {
+ 	int ntype = ch->node_type;
+ 	int gtype = ch->group_type;
+ 	unsigned long long sqnum = le64_to_cpu(ch->sqnum);
+
+ 	printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
+ 	printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
+ 	printk(KERN_DEBUG "\tnode_type %d (%s)\n", ntype, dbg_ntype(ntype));
+ 	printk(KERN_DEBUG "\tgroup_type %d (%s)\n", gtype, dbg_gtype(gtype));
+ 	printk(KERN_DEBUG "\tsqnum %llu\n", sqnum);
+ 	printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
+ }
+
+ /*
+  * Print the VFS fields of @inode followed by the fields of the UBIFS inode
+  * which embeds it. @c is not used.
+  */
+ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
+ {
+ 	const struct ubifs_inode *uinode = ubifs_inode(inode);
+
+ 	/* Generic VFS inode fields */
+ 	printk(KERN_DEBUG "inode %lu\n", inode->i_ino);
+ 	printk(KERN_DEBUG "size %llu\n",
+ 	       (unsigned long long)i_size_read(inode));
+ 	printk(KERN_DEBUG "nlink %u\n", inode->i_nlink);
+ 	printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid);
+ 	printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid);
+ 	printk(KERN_DEBUG "atime %u.%u\n",
+ 	       (unsigned int)inode->i_atime.tv_sec,
+ 	       (unsigned int)inode->i_atime.tv_nsec);
+ 	printk(KERN_DEBUG "mtime %u.%u\n",
+ 	       (unsigned int)inode->i_mtime.tv_sec,
+ 	       (unsigned int)inode->i_mtime.tv_nsec);
+ 	printk(KERN_DEBUG "ctime %u.%u\n",
+ 	       (unsigned int)inode->i_ctime.tv_sec,
+ 	       (unsigned int)inode->i_ctime.tv_nsec);
+
+ 	/* UBIFS-specific inode fields */
+ 	printk(KERN_DEBUG "creat_sqnum %llu\n", uinode->creat_sqnum);
+ 	printk(KERN_DEBUG "xattr_size %u\n", uinode->xattr_size);
+ 	printk(KERN_DEBUG "xattr_cnt %u\n", uinode->xattr_cnt);
+ 	printk(KERN_DEBUG "xattr_names %u\n", uinode->xattr_names);
+ 	printk(KERN_DEBUG "dirty %u\n", uinode->dirty);
+ 	printk(KERN_DEBUG "xattr %u\n", uinode->xattr);
+ 	printk(KERN_DEBUG "flags %d\n", uinode->flags);
+ 	printk(KERN_DEBUG "compr_type %d\n", uinode->compr_type);
+ 	printk(KERN_DEBUG "data_len %d\n", uinode->data_len);
+ }
+
+ /**
+  * dbg_dump_node - print the contents of a node.
+  * @c: UBIFS file-system description object
+  * @node: the node to dump, starting with its common header
+  *
+  * This function prints the common header of @node and then the fields which
+  * are specific to its node type. If the magic number of the common header is
+  * wrong, only the first %UBIFS_CH_SZ bytes are hex-dumped. Nothing is printed
+  * while 'dbg_failure_mode' is set.
+  *
+  * The node is printed with 'dbg_lock' held, which is what the key formatting
+  * helpers 'dbg_key_str0()' and 'dbg_key_str1()' require (presumably this is
+  * what 'DBGKEY()' expands to).
+  */
+ void dbg_dump_node(const struct ubifs_info *c, const void *node)
+ {
+ 	int i, n;
+ 	union ubifs_key key;
+ 	const struct ubifs_ch *ch = node;
+
+ 	if (dbg_failure_mode)
+ 		return;
+
+ 	/* If the magic is incorrect, just hexdump the first bytes */
+ 	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
+ 		printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
+ 		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
+ 			       (void *)node, UBIFS_CH_SZ, 1);
+ 		return;
+ 	}
+
+ 	spin_lock(&dbg_lock);
+ 	dump_ch(node);
+
+ 	switch (ch->node_type) {
+ 	case UBIFS_PAD_NODE:
+ 	{
+ 		const struct ubifs_pad_node *pad = node;
+
+ 		printk(KERN_DEBUG "\tpad_len %u\n",
+ 		       le32_to_cpu(pad->pad_len));
+ 		break;
+ 	}
+ 	case UBIFS_SB_NODE:
+ 	{
+ 		const struct ubifs_sb_node *sup = node;
+ 		unsigned int sup_flags = le32_to_cpu(sup->flags);
+
+ 		printk(KERN_DEBUG "\tkey_hash %d (%s)\n",
+ 		       (int)sup->key_hash, get_key_hash(sup->key_hash));
+ 		printk(KERN_DEBUG "\tkey_fmt %d (%s)\n",
+ 		       (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
+ 		printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
+ 		printk(KERN_DEBUG "\t big_lpt %u\n",
+ 		       !!(sup_flags & UBIFS_FLG_BIGLPT));
+ 		printk(KERN_DEBUG "\tmin_io_size %u\n",
+ 		       le32_to_cpu(sup->min_io_size));
+ 		printk(KERN_DEBUG "\tleb_size %u\n",
+ 		       le32_to_cpu(sup->leb_size));
+ 		printk(KERN_DEBUG "\tleb_cnt %u\n",
+ 		       le32_to_cpu(sup->leb_cnt));
+ 		printk(KERN_DEBUG "\tmax_leb_cnt %u\n",
+ 		       le32_to_cpu(sup->max_leb_cnt));
+ 		printk(KERN_DEBUG "\tmax_bud_bytes %llu\n",
+ 		       (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
+ 		printk(KERN_DEBUG "\tlog_lebs %u\n",
+ 		       le32_to_cpu(sup->log_lebs));
+ 		printk(KERN_DEBUG "\tlpt_lebs %u\n",
+ 		       le32_to_cpu(sup->lpt_lebs));
+ 		printk(KERN_DEBUG "\torph_lebs %u\n",
+ 		       le32_to_cpu(sup->orph_lebs));
+ 		printk(KERN_DEBUG "\tjhead_cnt %u\n",
+ 		       le32_to_cpu(sup->jhead_cnt));
+ 		printk(KERN_DEBUG "\tfanout %u\n",
+ 		       le32_to_cpu(sup->fanout));
+ 		printk(KERN_DEBUG "\tlsave_cnt %u\n",
+ 		       le32_to_cpu(sup->lsave_cnt));
+ 		printk(KERN_DEBUG "\tdefault_compr %u\n",
+ 		       (int)le16_to_cpu(sup->default_compr));
+ 		printk(KERN_DEBUG "\trp_size %llu\n",
+ 		       (unsigned long long)le64_to_cpu(sup->rp_size));
+ 		printk(KERN_DEBUG "\trp_uid %u\n",
+ 		       le32_to_cpu(sup->rp_uid));
+ 		printk(KERN_DEBUG "\trp_gid %u\n",
+ 		       le32_to_cpu(sup->rp_gid));
+ 		printk(KERN_DEBUG "\tfmt_version %u\n",
+ 		       le32_to_cpu(sup->fmt_version));
+ 		printk(KERN_DEBUG "\ttime_gran %u\n",
+ 		       le32_to_cpu(sup->time_gran));
+ 		printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X"
+ 		       "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
+ 		       sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
+ 		       sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
+ 		       sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
+ 		       sup->uuid[12], sup->uuid[13], sup->uuid[14],
+ 		       sup->uuid[15]);
+ 		break;
+ 	}
+ 	case UBIFS_MST_NODE:
+ 	{
+ 		const struct ubifs_mst_node *mst = node;
+
+ 		printk(KERN_DEBUG "\thighest_inum %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->highest_inum));
+ 		printk(KERN_DEBUG "\tcommit number %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->cmt_no));
+ 		printk(KERN_DEBUG "\tflags %#x\n",
+ 		       le32_to_cpu(mst->flags));
+ 		printk(KERN_DEBUG "\tlog_lnum %u\n",
+ 		       le32_to_cpu(mst->log_lnum));
+ 		printk(KERN_DEBUG "\troot_lnum %u\n",
+ 		       le32_to_cpu(mst->root_lnum));
+ 		printk(KERN_DEBUG "\troot_offs %u\n",
+ 		       le32_to_cpu(mst->root_offs));
+ 		printk(KERN_DEBUG "\troot_len %u\n",
+ 		       le32_to_cpu(mst->root_len));
+ 		printk(KERN_DEBUG "\tgc_lnum %u\n",
+ 		       le32_to_cpu(mst->gc_lnum));
+ 		printk(KERN_DEBUG "\tihead_lnum %u\n",
+ 		       le32_to_cpu(mst->ihead_lnum));
+ 		printk(KERN_DEBUG "\tihead_offs %u\n",
+ 		       le32_to_cpu(mst->ihead_offs));
+ 		printk(KERN_DEBUG "\tindex_size %u\n",
+ 		       le32_to_cpu(mst->index_size));
+ 		printk(KERN_DEBUG "\tlpt_lnum %u\n",
+ 		       le32_to_cpu(mst->lpt_lnum));
+ 		printk(KERN_DEBUG "\tlpt_offs %u\n",
+ 		       le32_to_cpu(mst->lpt_offs));
+ 		printk(KERN_DEBUG "\tnhead_lnum %u\n",
+ 		       le32_to_cpu(mst->nhead_lnum));
+ 		printk(KERN_DEBUG "\tnhead_offs %u\n",
+ 		       le32_to_cpu(mst->nhead_offs));
+ 		printk(KERN_DEBUG "\tltab_lnum %u\n",
+ 		       le32_to_cpu(mst->ltab_lnum));
+ 		printk(KERN_DEBUG "\tltab_offs %u\n",
+ 		       le32_to_cpu(mst->ltab_offs));
+ 		printk(KERN_DEBUG "\tlsave_lnum %u\n",
+ 		       le32_to_cpu(mst->lsave_lnum));
+ 		printk(KERN_DEBUG "\tlsave_offs %u\n",
+ 		       le32_to_cpu(mst->lsave_offs));
+ 		printk(KERN_DEBUG "\tlscan_lnum %u\n",
+ 		       le32_to_cpu(mst->lscan_lnum));
+ 		printk(KERN_DEBUG "\tleb_cnt %u\n",
+ 		       le32_to_cpu(mst->leb_cnt));
+ 		printk(KERN_DEBUG "\tempty_lebs %u\n",
+ 		       le32_to_cpu(mst->empty_lebs));
+ 		printk(KERN_DEBUG "\tidx_lebs %u\n",
+ 		       le32_to_cpu(mst->idx_lebs));
+ 		printk(KERN_DEBUG "\ttotal_free %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->total_free));
+ 		printk(KERN_DEBUG "\ttotal_dirty %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->total_dirty));
+ 		printk(KERN_DEBUG "\ttotal_used %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->total_used));
+ 		printk(KERN_DEBUG "\ttotal_dead %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->total_dead));
+ 		printk(KERN_DEBUG "\ttotal_dark %llu\n",
+ 		       (unsigned long long)le64_to_cpu(mst->total_dark));
+ 		break;
+ 	}
+ 	case UBIFS_REF_NODE:
+ 	{
+ 		const struct ubifs_ref_node *ref = node;
+
+ 		printk(KERN_DEBUG "\tlnum %u\n",
+ 		       le32_to_cpu(ref->lnum));
+ 		printk(KERN_DEBUG "\toffs %u\n",
+ 		       le32_to_cpu(ref->offs));
+ 		printk(KERN_DEBUG "\tjhead %u\n",
+ 		       le32_to_cpu(ref->jhead));
+ 		break;
+ 	}
+ 	case UBIFS_INO_NODE:
+ 	{
+ 		const struct ubifs_ino_node *ino = node;
+
+ 		key_read(c, &ino->key, &key);
+ 		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ 		printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
+ 		       (unsigned long long)le64_to_cpu(ino->creat_sqnum));
+ 		printk(KERN_DEBUG "\tsize %llu\n",
+ 		       (unsigned long long)le64_to_cpu(ino->size));
+ 		printk(KERN_DEBUG "\tnlink %u\n",
+ 		       le32_to_cpu(ino->nlink));
+ 		printk(KERN_DEBUG "\tatime %lld.%u\n",
+ 		       (long long)le64_to_cpu(ino->atime_sec),
+ 		       le32_to_cpu(ino->atime_nsec));
+ 		printk(KERN_DEBUG "\tmtime %lld.%u\n",
+ 		       (long long)le64_to_cpu(ino->mtime_sec),
+ 		       le32_to_cpu(ino->mtime_nsec));
+ 		printk(KERN_DEBUG "\tctime %lld.%u\n",
+ 		       (long long)le64_to_cpu(ino->ctime_sec),
+ 		       le32_to_cpu(ino->ctime_nsec));
+ 		printk(KERN_DEBUG "\tuid %u\n",
+ 		       le32_to_cpu(ino->uid));
+ 		printk(KERN_DEBUG "\tgid %u\n",
+ 		       le32_to_cpu(ino->gid));
+ 		printk(KERN_DEBUG "\tmode %u\n",
+ 		       le32_to_cpu(ino->mode));
+ 		printk(KERN_DEBUG "\tflags %#x\n",
+ 		       le32_to_cpu(ino->flags));
+ 		printk(KERN_DEBUG "\txattr_cnt %u\n",
+ 		       le32_to_cpu(ino->xattr_cnt));
+ 		printk(KERN_DEBUG "\txattr_size %u\n",
+ 		       le32_to_cpu(ino->xattr_size));
+ 		printk(KERN_DEBUG "\txattr_names %u\n",
+ 		       le32_to_cpu(ino->xattr_names));
+ 		printk(KERN_DEBUG "\tcompr_type %#x\n",
+ 		       (int)le16_to_cpu(ino->compr_type));
+ 		printk(KERN_DEBUG "\tdata len %u\n",
+ 		       le32_to_cpu(ino->data_len));
+ 		break;
+ 	}
+ 	case UBIFS_DENT_NODE:
+ 	case UBIFS_XENT_NODE:
+ 	{
+ 		const struct ubifs_dent_node *dent = node;
+ 		int nlen = le16_to_cpu(dent->nlen);
+
+ 		key_read(c, &dent->key, &key);
+ 		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ 		printk(KERN_DEBUG "\tinum %llu\n",
+ 		       (unsigned long long)le64_to_cpu(dent->inum));
+ 		printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
+ 		printk(KERN_DEBUG "\tnlen %d\n", nlen);
+ 		printk(KERN_DEBUG "\tname ");
+
+ 		/* Do not trust the name length of a possibly corrupted node */
+ 		if (nlen > UBIFS_MAX_NLEN)
+ 			printk(KERN_DEBUG "(bad name length, not printing, "
+ 			       "bad or corrupted node)");
+ 		else {
+ 			for (i = 0; i < nlen && dent->name[i]; i++)
+ 				printk("%c", dent->name[i]);
+ 		}
+ 		printk("\n");
+
+ 		break;
+ 	}
+ 	case UBIFS_DATA_NODE:
+ 	{
+ 		const struct ubifs_data_node *dn = node;
+ 		int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
+
+ 		key_read(c, &dn->key, &key);
+ 		printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ 		printk(KERN_DEBUG "\tsize %u\n",
+ 		       le32_to_cpu(dn->size));
+ 		printk(KERN_DEBUG "\tcompr_typ %d\n",
+ 		       (int)le16_to_cpu(dn->compr_type));
+ 		printk(KERN_DEBUG "\tdata size %d\n",
+ 		       dlen);
+ 		printk(KERN_DEBUG "\tdata:\n");
+ 		/*
+ 		 * A corrupted node length may make @dlen negative, which would
+ 		 * turn into a huge 'size_t' length in 'print_hex_dump()'.
+ 		 */
+ 		if (dlen > 0)
+ 			print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET,
+ 				       32, 1, (void *)&dn->data, dlen, 0);
+ 		break;
+ 	}
+ 	case UBIFS_TRUN_NODE:
+ 	{
+ 		const struct ubifs_trun_node *trun = node;
+
+ 		printk(KERN_DEBUG "\tinum %u\n",
+ 		       le32_to_cpu(trun->inum));
+ 		printk(KERN_DEBUG "\told_size %llu\n",
+ 		       (unsigned long long)le64_to_cpu(trun->old_size));
+ 		printk(KERN_DEBUG "\tnew_size %llu\n",
+ 		       (unsigned long long)le64_to_cpu(trun->new_size));
+ 		break;
+ 	}
+ 	case UBIFS_IDX_NODE:
+ 	{
+ 		const struct ubifs_idx_node *idx = node;
+
+ 		n = le16_to_cpu(idx->child_cnt);
+ 		printk(KERN_DEBUG "\tchild_cnt %d\n", n);
+ 		printk(KERN_DEBUG "\tlevel %d\n",
+ 		       (int)le16_to_cpu(idx->level));
+ 		printk(KERN_DEBUG "\tBranches:\n");
+
+ 		/* The on-flash child count is additionally capped by fanout */
+ 		for (i = 0; i < n && i < c->fanout - 1; i++) {
+ 			const struct ubifs_branch *br;
+
+ 			br = ubifs_idx_branch(c, idx, i);
+ 			key_read(c, &br->key, &key);
+ 			printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
+ 			       i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
+ 			       le32_to_cpu(br->len), DBGKEY(&key));
+ 		}
+ 		break;
+ 	}
+ 	case UBIFS_CS_NODE:
+ 		break;
+ 	case UBIFS_ORPH_NODE:
+ 	{
+ 		const struct ubifs_orph_node *orph = node;
+
+ 		/* The top bit of @cmt_no is printed separately as a flag */
+ 		printk(KERN_DEBUG "\tcommit number %llu\n",
+ 		       (unsigned long long)
+ 				le64_to_cpu(orph->cmt_no) & LLONG_MAX);
+ 		printk(KERN_DEBUG "\tlast node flag %llu\n",
+ 		       (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
+ 		/* Each orphan inode number takes 8 bytes after the header */
+ 		n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
+ 		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
+ 		for (i = 0; i < n; i++)
+ 			printk(KERN_DEBUG "\t ino %llu\n",
+ 			       (unsigned long long)le64_to_cpu(orph->inos[i]));
+ 		break;
+ 	}
+ 	default:
+ 		printk(KERN_DEBUG "node type %d was not recognized\n",
+ 		       (int)ch->node_type);
+ 	}
+ 	spin_unlock(&dbg_lock);
+ }
+
+ /* Print all fields of budgeting request @req, with 'dbg_lock' held */
+ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
+ {
+ 	spin_lock(&dbg_lock);
+
+ 	/* Inode related part of the request */
+ 	printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
+ 	       req->new_ino, req->dirtied_ino);
+ 	printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n",
+ 	       req->new_ino_d, req->dirtied_ino_d);
+
+ 	/* Page and directory entry related part */
+ 	printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n",
+ 	       req->new_page, req->dirtied_page);
+ 	printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n",
+ 	       req->new_dent, req->mod_dent);
+
+ 	/* Growth values */
+ 	printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth);
+ 	printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n",
+ 	       req->data_growth, req->dd_growth);
+
+ 	spin_unlock(&dbg_lock);
+ }
+
+ /* Print LEB properties statistics @lst, with 'dbg_lock' held */
+ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
+ {
+ 	spin_lock(&dbg_lock);
+
+ 	printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs %d\n",
+ 	       lst->empty_lebs, lst->idx_lebs);
+ 	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
+ 	       "total_dirty %lld\n",
+ 	       lst->taken_empty_lebs, lst->total_free, lst->total_dirty);
+ 	printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
+ 	       "total_dead %lld\n",
+ 	       lst->total_used, lst->total_dark, lst->total_dead);
+
+ 	spin_unlock(&dbg_lock);
+ }
+
+ /*
+  * Print the budgeting state of file-system @c: the budgeting counters, the
+  * journal heads, the buds, the old buds and the garbage-collected index LEBs.
+  * Everything is printed with 'dbg_lock' held.
+  */
+ void dbg_dump_budg(struct ubifs_info *c)
+ {
+ 	int jhead;
+ 	struct rb_node *node;
+ 	struct ubifs_bud *bud;
+ 	struct ubifs_gced_idx_leb *gced;
+
+ 	spin_lock(&dbg_lock);
+
+ 	/* Budgeting counters */
+ 	printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
+ 	       "budg_dd_growth %lld, budg_idx_growth %lld\n",
+ 	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
+ 	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
+ 	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
+ 	       c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
+ 	       c->freeable_cnt);
+ 	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
+ 	       "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
+ 	       c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+ 	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
+ 	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
+ 	       atomic_long_read(&c->dirty_zn_cnt),
+ 	       atomic_long_read(&c->clean_zn_cnt));
+ 	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+ 	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+ 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
+ 	       c->gc_lnum, c->ihead_lnum);
+
+ 	/* Journal heads */
+ 	for (jhead = 0; jhead < c->jhead_cnt; jhead++)
+ 		printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+ 		       c->jheads[jhead].wbuf.jhead,
+ 		       c->jheads[jhead].wbuf.lnum);
+
+ 	/* Buds of the RB-tree, then the old buds */
+ 	node = rb_first(&c->buds);
+ 	while (node) {
+ 		bud = rb_entry(node, struct ubifs_bud, rb);
+ 		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
+ 		node = rb_next(node);
+ 	}
+ 	list_for_each_entry(bud, &c->old_buds, list)
+ 		printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
+
+ 	/* Garbage-collected index LEBs */
+ 	list_for_each_entry(gced, &c->idx_gc, list)
+ 		printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
+ 		       gced->lnum, gced->unmap);
+
+ 	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+ 	spin_unlock(&dbg_lock);
+ }
+
+ /*
+  * Print LEB properties @lp on one line. The "used" value is what remains of
+  * the LEB size after subtracting the free and the dirty space.
+  */
+ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
+ {
+ 	printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
+ 	       "flags %#x\n",
+ 	       lp->lnum, lp->free, lp->dirty,
+ 	       c->leb_size - lp->free - lp->dirty,
+ 	       lp->flags);
+ }
+
+ /*
+  * Print the LEB properties statistics of file-system @c, followed by the
+  * properties of every LEB from 'c->main_first' up to 'c->leb_cnt'. LEBs whose
+  * properties cannot be read are reported and skipped.
+  */
+ void dbg_dump_lprops(struct ubifs_info *c)
+ {
+ 	int lnum, err;
+ 	struct ubifs_lprops lp;
+ 	struct ubifs_lp_stats lst;
+
+ 	printk(KERN_DEBUG "Dumping LEB properties\n");
+ 	ubifs_get_lp_stats(c, &lst);
+ 	dbg_dump_lstats(&lst);
+
+ 	for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+ 		err = ubifs_read_one_lp(c, lnum, &lp);
+ 		if (err) {
+ 			ubifs_err("cannot read lprops for LEB %d", lnum);
+ 			/* @lp was not filled in, so there is nothing to dump */
+ 			continue;
+ 		}
+
+ 		dbg_dump_lprop(c, &lp);
+ 	}
+ }
+
+ /*
+  * Scan LEB @lnum into 'c->dbg_buf' and dump every node which was found.
+  * Nothing is done while 'dbg_failure_mode' is set; a scan failure is reported
+  * and ends the dump.
+  */
+ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
+ {
+ 	struct ubifs_scan_leb *scan;
+ 	struct ubifs_scan_node *entry;
+
+ 	if (dbg_failure_mode)
+ 		return;
+
+ 	printk(KERN_DEBUG "Dumping LEB %d\n", lnum);
+
+ 	scan = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ 	if (IS_ERR(scan)) {
+ 		ubifs_err("scan error %d", (int)PTR_ERR(scan));
+ 		return;
+ 	}
+
+ 	printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
+ 	       scan->nodes_cnt, scan->endpt);
+
+ 	list_for_each_entry(entry, &scan->nodes, list) {
+ 		/* Give other tasks a chance to run between nodes */
+ 		cond_resched();
+ 		printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
+ 		       entry->offs, entry->len);
+ 		dbg_dump_node(c, entry->node);
+ 	}
+
+ 	ubifs_scan_destroy(scan);
+ }
+
+ /*
+  * Print @znode and, if its child count is within 1..fanout, all of its
+  * zbranches. The zbranch which describes the znode itself is taken from the
+  * parent, or from 'c->zroot' when the znode has no parent. Everything is
+  * printed with 'dbg_lock' held.
+  */
+ void dbg_dump_znode(const struct ubifs_info *c,
+ 		    const struct ubifs_znode *znode)
+ {
+ 	int n;
+ 	const struct ubifs_zbranch *zbr;
+
+ 	spin_lock(&dbg_lock);
+ 	zbr = znode->parent ? &znode->parent->zbranch[znode->iip] : &c->zroot;
+
+ 	printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
+ 	       " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
+ 	       zbr->len, znode->parent, znode->iip, znode->level,
+ 	       znode->child_cnt, znode->flags);
+
+ 	/* Do not walk the zbranch array if the child count looks bogus */
+ 	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout)
+ 		goto out_unlock;
+
+ 	printk(KERN_DEBUG "zbranches:\n");
+ 	for (n = 0; n < znode->child_cnt; n++) {
+ 		zbr = &znode->zbranch[n];
+ 		if (znode->level <= 0) {
+ 			printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
+ 			       "%s\n", n, zbr->znode, zbr->lnum,
+ 			       zbr->offs, zbr->len,
+ 			       DBGKEY(&zbr->key));
+ 			continue;
+ 		}
+ 		printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
+ 		       "%s\n", n, zbr->znode, zbr->lnum,
+ 		       zbr->offs, zbr->len,
+ 		       DBGKEY(&zbr->key));
+ 	}
+
+ out_unlock:
+ 	spin_unlock(&dbg_lock);
+ }
+
+ /*
+  * Print every element of LEB properties heap @heap; @cat is only used in the
+  * heading. @c is not used.
+  */
+ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
+ {
+ 	int pos;
+ 	struct ubifs_lprops *lp;
+
+ 	printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
+ 	       cat, heap->cnt);
+
+ 	for (pos = 0; pos < heap->cnt; pos++) {
+ 		lp = heap->arr[pos];
+ 		printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
+ 		       "flags %d\n", pos, lp->lnum, lp->hpos,
+ 		       lp->free, lp->dirty, lp->flags);
+ 	}
+ }
+
+ /*
+  * Print @pnode, the @parent and @iip values given by the caller, and all
+  * %UBIFS_LPT_FANOUT LEB properties stored in the pnode. @c is not used.
+  */
+ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ 		    struct ubifs_nnode *parent, int iip)
+ {
+ 	int slot;
+ 	struct ubifs_lprops *lp;
+
+ 	printk(KERN_DEBUG "Dumping pnode:\n");
+ 	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
+ 	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
+ 	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
+ 	       pnode->flags, iip, pnode->level, pnode->num);
+
+ 	for (slot = 0; slot < UBIFS_LPT_FANOUT; slot++) {
+ 		lp = &pnode->lprops[slot];
+ 		printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
+ 		       slot, lp->free, lp->dirty, lp->flags, lp->lnum);
+ 	}
+ }
+
+ /*
+  * Dump all znodes of the TNC tree of @c in level order, printing a heading
+  * each time a new level starts. Nothing but the surrounding headings is
+  * printed if the level-order walk finds no znode at all.
+  */
+ void dbg_dump_tnc(struct ubifs_info *c)
+ {
+ 	struct ubifs_znode *znode;
+ 	int level = 0;
+
+ 	printk(KERN_DEBUG "\n");
+ 	printk(KERN_DEBUG "Dumping the TNC tree\n");
+ 	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+ 	/* The walk may yield nothing, so check before touching the znode */
+ 	if (znode) {
+ 		level = znode->level;
+ 		printk(KERN_DEBUG "== Level %d ==\n", level);
+ 	}
+ 	while (znode) {
+ 		if (level != znode->level) {
+ 			level = znode->level;
+ 			printk(KERN_DEBUG "== Level %d ==\n", level);
+ 		}
+ 		dbg_dump_znode(c, znode);
+ 		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+ 	}
+
+ 	printk(KERN_DEBUG "\n");
+ }
+
+ /*
+  * Znode callback passed to 'dbg_walk_index()' by 'dbg_dump_index()': dumps
+  * @znode and always returns zero. @priv is not used.
+  */
+ static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
+ 		      void *priv)
+ {
+ 	int ret = 0;
+
+ 	dbg_dump_znode(c, znode);
+ 	return ret;
+ }
+
+/**
+ * dbg_dump_index - dump the on-flash index.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
+ * which dumps only in-memory znodes and does not read znodes which from flash.
+ */
+void dbg_dump_index(struct ubifs_info *c)
+{
+ dbg_walk_index(c, NULL, dump_znode, NULL);
+}
+
+/**
+ * dbg_check_synced_i_size - check synchronized inode size.
+ * @inode: inode to check
+ *
+ * If inode is clean, synchronized inode size has to be equivalent to current
+ * inode size. This function has to be called only for locked inodes (@i_mutex
+ * has to be locked). Returns %0 if synchronized inode size if correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(struct inode *inode)
+{
+ int err = 0;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ mutex_lock(&ui->ui_mutex);
+ spin_lock(&ui->ui_lock);
+ if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
+ ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
+ "is clean", ui->ui_size, ui->synced_i_size);
+ ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+ inode->i_mode, i_size_read(inode));
+ dbg_dump_stack();
+ err = -EINVAL;
+ }
+ spin_unlock(&ui->ui_lock);
+ mutex_unlock(&ui->ui_mutex);
+ return err;
+}
+
+/*
+ * dbg_check_dir - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to calculate size for
+ * @size: the result is returned here
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
+{
+ unsigned int nlink = 2;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *pdent = NULL;
+ struct qstr nm = { .name = NULL };
+ loff_t size = UBIFS_INO_NODE_SZ;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ if (!S_ISDIR(dir->i_mode))
+ return 0;
+
+ lowest_dent_key(c, &key, dir->i_ino);
+ while (1) {
+ int err;
+
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ if (err == -ENOENT)
+ break;
+ return err;
+ }
+
+ nm.name = dent->name;
+ nm.len = le16_to_cpu(dent->nlen);
+ size += CALC_DENT_SIZE(nm.len);
+ if (dent->type == UBIFS_ITYPE_DIR)
+ nlink += 1;
+ kfree(pdent);
+ pdent = dent;
+ key_read(c, &dent->key, &key);
+ }
+ kfree(pdent);
+
+ if (i_size_read(dir) != size) {
+ ubifs_err("directory inode %lu has size %llu, "
+ "but calculated size is %llu", dir->i_ino,
+ (unsigned long long)i_size_read(dir),
+ (unsigned long long)size);
+ dump_stack();
+ return -EINVAL;
+ }
+ if (dir->i_nlink != nlink) {
+ ubifs_err("directory inode %lu has nlink %u, but calculated "
+ "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
+ dump_stack();
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * dbg_check_key_order - make sure that colliding keys are properly ordered.
+ * @c: UBIFS file-system description object
+ * @zbr1: first zbranch
+ * @zbr2: following zbranch
+ *
+ * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of
+ * names of the direntries/xentries which are referred by the keys. This
+ * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
+ * sure the name of direntry/xentry referred by @zbr1 is less than
+ * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
+ * and a negative error code in case of failure.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+ struct ubifs_zbranch *zbr2)
+{
+ int err, nlen1, nlen2, cmp;
+ struct ubifs_dent_node *dent1, *dent2;
+ union ubifs_key key;
+
+ ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+ dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent1)
+ return -ENOMEM;
+ dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent2) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ err = ubifs_tnc_read_node(c, zbr1, dent1);
+ if (err)
+ goto out_free;
+ err = ubifs_validate_entry(c, dent1);
+ if (err)
+ goto out_free;
+
+ err = ubifs_tnc_read_node(c, zbr2, dent2);
+ if (err)
+ goto out_free;
+ err = ubifs_validate_entry(c, dent2);
+ if (err)
+ goto out_free;
+
+ /* Make sure node keys are the same as in zbranch */
+ err = 1;
+ key_read(c, &dent1->key, &key);
+ if (keys_cmp(c, &zbr1->key, &key)) {
+ dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, DBGKEY(&key));
+ dbg_err("but it should have key %s according to tnc",
+ DBGKEY(&zbr1->key));
+ dbg_dump_node(c, dent1);
+ goto out_free;
+ }
+
+ key_read(c, &dent2->key, &key);
+ if (keys_cmp(c, &zbr2->key, &key)) {
+ dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, DBGKEY(&key));
+ dbg_err("but it should have key %s according to tnc",
+ DBGKEY(&zbr2->key));
+ dbg_dump_node(c, dent2);
+ goto out_free;
+ }
+
+ nlen1 = le16_to_cpu(dent1->nlen);
+ nlen2 = le16_to_cpu(dent2->nlen);
+
+ cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+ if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+ err = 0;
+ goto out_free;
+ }
+ if (cmp == 0 && nlen1 == nlen2)
+ dbg_err("2 xent/dent nodes with the same name");
+ else
+ dbg_err("bad order of colliding key %s",
+ DBGKEY(&key));
+
+ dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+ dbg_dump_node(c, dent1);
+ dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+ dbg_dump_node(c, dent2);
+
+out_free:
+ kfree(dent2);
+ kfree(dent1);
+ return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+ struct ubifs_znode *znode = zbr->znode;
+ struct ubifs_znode *zp = znode->parent;
+ int n, err, cmp;
+
+ if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+ err = 1;
+ goto out;
+ }
+ if (znode->level < 0) {
+ err = 2;
+ goto out;
+ }
+ if (znode->iip < 0 || znode->iip >= c->fanout) {
+ err = 3;
+ goto out;
+ }
+
+ if (zbr->len == 0)
+ /* Only dirty zbranch may have no on-flash nodes */
+ if (!ubifs_zn_dirty(znode)) {
+ err = 4;
+ goto out;
+ }
+
+ if (ubifs_zn_dirty(znode)) {
+ /*
+ * If znode is dirty, its parent has to be dirty as well. The
+ * order of the operation is important, so we have to have
+ * memory barriers.
+ */
+ smp_mb();
+ if (zp && !ubifs_zn_dirty(zp)) {
+ /*
+ * The dirty flag is atomic and is cleared outside the
+ * TNC mutex, so znode's dirty flag may now have
+ * been cleared. The child is always cleared before the
+ * parent, so we just need to check again.
+ */
+ smp_mb();
+ if (ubifs_zn_dirty(znode)) {
+ err = 5;
+ goto out;
+ }
+ }
+ }
+
+ if (zp) {
+ const union ubifs_key *min, *max;
+
+ if (znode->level != zp->level - 1) {
+ err = 6;
+ goto out;
+ }
+
+ /* Make sure the 'parent' pointer in our znode is correct */
+ err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
+ if (!err) {
+ /* This zbranch does not exist in the parent */
+ err = 7;
+ goto out;
+ }
+
+ if (znode->iip >= zp->child_cnt) {
+ err = 8;
+ goto out;
+ }
+
+ if (znode->iip != n) {
+ /* This may happen only in case of collisions */
+ if (keys_cmp(c, &zp->zbranch[n].key,
+ &zp->zbranch[znode->iip].key)) {
+ err = 9;
+ goto out;
+ }
+ n = znode->iip;
+ }
+
+ /*
+ * Make sure that the first key in our znode is greater than or
+ * equal to the key in the pointing zbranch.
+ */
+ min = &zbr->key;
+ cmp = keys_cmp(c, min, &znode->zbranch[0].key);
+ if (cmp == 1) {
+ err = 10;
+ goto out;
+ }
+
+ if (n + 1 < zp->child_cnt) {
+ max = &zp->zbranch[n + 1].key;
+
+ /*
+ * Make sure the last key in our znode is less or
+ * equivalent than the the key in zbranch which goes
+ * after our pointing zbranch.
+ */
+ cmp = keys_cmp(c, max,
+ &znode->zbranch[znode->child_cnt - 1].key);
+ if (cmp == -1) {
+ err = 11;
+ goto out;
+ }
+ }
+ } else {
+ /* This may only be root znode */
+ if (zbr != &c->zroot) {
+ err = 12;
+ goto out;
+ }
+ }
+
+ /*
+ * Make sure that next key is greater or equivalent then the previous
+ * one.
+ */
+ for (n = 1; n < znode->child_cnt; n++) {
+ cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
+ &znode->zbranch[n].key);
+ if (cmp > 0) {
+ err = 13;
+ goto out;
+ }
+ if (cmp == 0) {
+ /* This can only be keys with colliding hash */
+ if (!is_hash_key(c, &znode->zbranch[n].key)) {
+ err = 14;
+ goto out;
+ }
+
+ if (znode->level != 0 || c->replaying)
+ continue;
+
+ /*
+ * Colliding keys should follow binary order of
+ * corresponding xentry/dentry names.
+ */
+ err = dbg_check_key_order(c, &znode->zbranch[n - 1],
+ &znode->zbranch[n]);
+ if (err < 0)
+ return err;
+ if (err) {
+ err = 15;
+ goto out;
+ }
+ }
+ }
+
+ for (n = 0; n < znode->child_cnt; n++) {
+ if (!znode->zbranch[n].znode &&
+ (znode->zbranch[n].lnum == 0 ||
+ znode->zbranch[n].len == 0)) {
+ err = 16;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum != 0 &&
+ znode->zbranch[n].len == 0) {
+ err = 17;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum == 0 &&
+ znode->zbranch[n].len != 0) {
+ err = 18;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum == 0 &&
+ znode->zbranch[n].offs != 0) {
+ err = 19;
+ goto out;
+ }
+
+ if (znode->level != 0 && znode->zbranch[n].znode)
+ if (znode->zbranch[n].znode->parent != znode) {
+ err = 20;
+ goto out;
+ }
+ }
+
+ return 0;
+
+out:
+ ubifs_err("failed, error %d", err);
+ ubifs_msg("dump of the znode");
+ dbg_dump_znode(c, znode);
+ if (zp) {
+ ubifs_msg("dump of the parent znode");
+ dbg_dump_znode(c, zp);
+ }
+ dump_stack();
+ return -EINVAL;
+}
+
+/**
+ * dbg_check_tnc - check TNC tree.
+ * @c: UBIFS file-system description object
+ * @extra: do extra checks that are possible at start commit
+ *
+ * This function traverses whole TNC tree and checks every znode. Returns zero
+ * if everything is all right and %-EINVAL if something is wrong with TNC.
+ * Errors returned by 'dbg_check_znode()' and negative errors returned by
+ * 'dbg_check_key_order()' are passed to the caller as is.
+ *
+ * The function does nothing and returns zero if the %UBIFS_CHK_TNC check flag
+ * is not set or if the root znode is not in the TNC. If @extra is non-zero,
+ * the numbers of clean and dirty znodes met during the walk are additionally
+ * compared with @c->clean_zn_cnt and @c->dirty_zn_cnt.
+ */
+int dbg_check_tnc(struct ubifs_info *c, int extra)
+{
+ struct ubifs_znode *znode;
+ long clean_cnt = 0, dirty_cnt = 0;
+ int err, last;
+
+ /* The whole check is optional and is enabled by a check flag */
+ if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
+ return 0;
+
+ ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+ /* Nothing to check if the root znode is not in the TNC */
+ if (!c->zroot.znode)
+ return 0;
+
+ znode = ubifs_tnc_postorder_first(c->zroot.znode);
+ while (1) {
+ struct ubifs_znode *prev;
+ struct ubifs_zbranch *zbr;
+
+ /* Find the zbranch which points to this znode */
+ if (!znode->parent)
+ zbr = &c->zroot;
+ else
+ zbr = &znode->parent->zbranch[znode->iip];
+
+ err = dbg_check_znode(c, zbr);
+ if (err)
+ return err;
+
+ /* Count clean and dirty znodes for the final comparison */
+ if (extra) {
+ if (ubifs_zn_dirty(znode))
+ dirty_cnt += 1;
+ else
+ clean_cnt += 1;
+ }
+
+ prev = znode;
+ znode = ubifs_tnc_postorder_next(znode);
+ if (!znode)
+ break;
+
+ /*
+ * If the last key of this znode is equivalent to the first key
+ * of the next znode (collision), then check order of the keys.
+ */
+ last = prev->child_cnt - 1;
+ if (prev->level == 0 && znode->level == 0 && !c->replaying &&
+ !keys_cmp(c, &prev->zbranch[last].key,
+ &znode->zbranch[0].key)) {
+ err = dbg_check_key_order(c, &prev->zbranch[last],
+ &znode->zbranch[0]);
+ /* Negative value is an error, positive is wrong order */
+ if (err < 0)
+ return err;
+ if (err) {
+ ubifs_msg("first znode");
+ dbg_dump_znode(c, prev);
+ ubifs_msg("second znode");
+ dbg_dump_znode(c, znode);
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* Compare what was counted with what the TNC accounting says */
+ if (extra) {
+ if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
+ ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
+ atomic_long_read(&c->clean_zn_cnt),
+ clean_cnt);
+ return -EINVAL;
+ }
+ if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
+ ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
+ atomic_long_read(&c->dirty_zn_cnt),
+ dirty_cnt);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * dbg_walk_index - walk the on-flash index.
+ * @c: UBIFS file-system description object
+ * @leaf_cb: called for each leaf node
+ * @znode_cb: called for each indexing node
+ * @priv: private data which is passed to callbacks
+ *
+ * This function walks the UBIFS index and calls the @leaf_cb for each leaf
+ * node and @znode_cb for each indexing node. Returns zero in case of success
+ * and a negative error code in case of failure. Either callback may be %NULL,
+ * in which case it is not called. A non-zero value returned by a callback
+ * stops the walk and is returned to the caller.
+ *
+ * The TNC mutex is taken for the whole duration of the walk. Znodes which are
+ * not in the TNC are loaded with 'ubifs_load_znode()' and stay attached to
+ * their zbranches.
+ *
+ * It would be better if this function removed every znode it pulled into
+ * the TNC, so that the behavior more closely matched the non-debugging
+ * behavior.
+ */
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+ dbg_znode_callback znode_cb, void *priv)
+{
+ int err;
+ struct ubifs_zbranch *zbr;
+ struct ubifs_znode *znode, *child;
+
+ mutex_lock(&c->tnc_mutex);
+ /* If the root indexing node is not in TNC - pull it */
+ if (!c->zroot.znode) {
+ c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(c->zroot.znode)) {
+ err = PTR_ERR(c->zroot.znode);
+ /* Do not leave an error pointer in the root zbranch */
+ c->zroot.znode = NULL;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * We are going to traverse the indexing tree in the postorder manner.
+ * Go down and find the leftmost indexing node where we are going to
+ * start from.
+ */
+ znode = c->zroot.znode;
+ while (znode->level > 0) {
+ zbr = &znode->zbranch[0];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, 0);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+
+ znode = child;
+ }
+
+ /* Iterate over all indexing nodes */
+ while (1) {
+ int idx;
+
+ cond_resched();
+
+ if (znode_cb) {
+ err = znode_cb(c, znode, priv);
+ if (err) {
+ ubifs_err("znode checking function returned "
+ "error %d", err);
+ /*
+ * NOTE(review): the znode is dumped here and
+ * then once more at the 'out_dump' label.
+ */
+ dbg_dump_znode(c, znode);
+ goto out_dump;
+ }
+ }
+ /* Only level 0 znodes refer to leaf nodes */
+ if (leaf_cb && znode->level == 0) {
+ for (idx = 0; idx < znode->child_cnt; idx++) {
+ zbr = &znode->zbranch[idx];
+ err = leaf_cb(c, zbr, priv);
+ if (err) {
+ ubifs_err("leaf checking function "
+ "returned error %d, for leaf "
+ "at LEB %d:%d",
+ err, zbr->lnum, zbr->offs);
+ goto out_dump;
+ }
+ }
+ }
+
+ /* The root znode is the last one to be visited */
+ if (!znode->parent)
+ break;
+
+ idx = znode->iip + 1;
+ znode = znode->parent;
+ if (idx < znode->child_cnt) {
+ /* Switch to the next index in the parent */
+ zbr = &znode->zbranch[idx];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, idx);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+ znode = child;
+ } else
+ /*
+ * This is the last child, switch to the parent and
+ * continue.
+ */
+ continue;
+
+ /* Go to the lowest leftmost znode in the new sub-tree */
+ while (znode->level > 0) {
+ zbr = &znode->zbranch[0];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, 0);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+ znode = child;
+ }
+ }
+
+ mutex_unlock(&c->tnc_mutex);
+ return 0;
+
+out_dump:
+ /* Find the zbranch pointing to the znode to print its position */
+ if (znode->parent)
+ zbr = &znode->parent->zbranch[znode->iip];
+ else
+ zbr = &c->zroot;
+ ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
+ dbg_dump_znode(c, znode);
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * add_size - accumulate the size of one indexing node.
+ * @c: UBIFS file-system description object
+ * @znode: znode whose index node size has to be accounted
+ * @priv: points to the 'long long' running total of the index size
+ *
+ * This is the znode callback which 'dbg_check_idx_size()' passes to
+ * 'dbg_walk_index()'. It adds the 8-byte aligned size of the index node
+ * corresponding to @znode to the total pointed to by @priv. Always returns
+ * zero.
+ */
+static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
+{
+	long long *total = priv;
+	int node_sz = ubifs_idx_node_sz(c, znode->child_cnt);
+
+	*total += ALIGN(node_sz, 8);
+	return 0;
+}
+
+/**
+ * dbg_check_idx_size - verify the index size.
+ * @c: UBIFS file-system description object
+ * @idx_size: the index size which is expected
+ *
+ * This function walks the whole index, sums up the sizes of all indexing
+ * nodes and compares the result with @idx_size. Returns zero if the sizes
+ * match or if the %UBIFS_CHK_IDX_SZ check is not enabled, %-EINVAL if they
+ * differ, and the error returned by 'dbg_walk_index()' if the walk fails.
+ */
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
+{
+	long long walked_sz = 0;
+	int ret;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
+		return 0;
+
+	ret = dbg_walk_index(c, NULL, add_size, &walked_sz);
+	if (ret) {
+		ubifs_err("error %d while walking the index", ret);
+		return ret;
+	}
+
+	if (walked_sz == idx_size)
+		return 0;
+
+	ubifs_err("index size check failed: calculated size is %lld, "
+		  "should be %lld", walked_sz, idx_size);
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * struct fsck_inode - information about an inode used when checking the
+ *                     file-system.
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count (read from on-flash inode)
+ * @xattr_cnt: count of extended attributes (read from on-flash inode)
+ * @references: how many directory/xattr entries refer this inode (calculated
+ *              while walking the index)
+ * @calc_cnt: for directory inodes, calculated link count - starts from 2 and
+ *            is incremented for every child directory entry
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ *            inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: calculated count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ *             inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ *
+ * The "read from on-flash inode" fields are compared with their calculated
+ * counterparts in 'check_inodes()' after the index walk has finished.
+ */
+struct fsck_inode {
+ struct rb_node rb;
+ ino_t inum;
+ umode_t mode;
+ unsigned int nlink;
+ unsigned int xattr_cnt;
+ int references;
+ int calc_cnt;
+ long long size;
+ unsigned int xattr_sz;
+ long long calc_sz;
+ long long calc_xcnt;
+ long long calc_xsz;
+ unsigned int xattr_nms;
+ long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects), keyed
+ *          by inode number
+ *
+ * An object of this type is passed as the private data to 'check_leaf()'
+ * while the index is being walked by 'dbg_check_filesystem()'.
+ */
+struct fsck_data {
+ struct rb_root inodes;
+};
+
+/**
+ * add_inode - insert information about an inode to the RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @ino: raw UBIFS inode node to take the information from
+ *
+ * This helper is used by 'check_leaf()' and 'read_add_inode()'. If the inode
+ * described by @ino is already in the RB-tree, the existing object is
+ * returned. Otherwise a new object is allocated, filled from @ino, and
+ * linked to the tree. Returns the inode information pointer in case of
+ * success, %-EINVAL (as an error pointer) if the inode number is greater
+ * than @c->highest_inum, and %-ENOMEM (as an error pointer) if there is no
+ * memory.
+ */
+static struct fsck_inode *add_inode(struct ubifs_info *c,
+				    struct fsck_data *fsckd,
+				    struct ubifs_ino_node *ino)
+{
+	struct rb_node **link = &fsckd->inodes.rb_node;
+	struct rb_node *parent = NULL;
+	struct fsck_inode *fscki;
+	ino_t inum = key_inum_flash(c, &ino->key);
+
+	/* Look for the inode and remember where to link it if it is new */
+	while (*link) {
+		parent = *link;
+		fscki = rb_entry(parent, struct fsck_inode, rb);
+		if (inum == fscki->inum)
+			return fscki;
+		if (inum < fscki->inum)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	if (inum > c->highest_inum) {
+		ubifs_err("too high inode number, max. is %lu",
+			  c->highest_inum);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
+	if (fscki == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Copy the on-flash values which are verified later */
+	fscki->inum = inum;
+	fscki->mode = le32_to_cpu(ino->mode);
+	fscki->nlink = le32_to_cpu(ino->nlink);
+	fscki->size = le64_to_cpu(ino->size);
+	fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+	fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+	fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+
+	/* Initial values of the calculated directory size and link count */
+	if (S_ISDIR(fscki->mode)) {
+		fscki->calc_sz = UBIFS_INO_NODE_SZ;
+		fscki->calc_cnt = 2;
+	}
+
+	rb_link_node(&fscki->rb, parent, link);
+	rb_insert_color(&fscki->rb, &fsckd->inodes);
+	return fscki;
+}
+
+/**
+ * search_inode - look an inode up in the RB-tree of inodes.
+ * @fsckd: FS checking information
+ * @inum: number of the inode to find
+ *
+ * This helper is used by 'read_add_inode()'. It returns the information
+ * object of inode @inum if it is present in the RB-tree of inodes and %NULL
+ * if it is not.
+ */
+static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
+{
+	struct rb_node *node = fsckd->inodes.rb_node;
+
+	while (node) {
+		struct fsck_inode *fscki;
+
+		fscki = rb_entry(node, struct fsck_inode, rb);
+		if (inum == fscki->inum)
+			return fscki;
+		if (inum < fscki->inum)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+/**
+ * read_add_inode - read an inode node and add it to the RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @inum: number of the inode to read
+ *
+ * This is a helper function for 'check_leaf()'. If inode @inum is already in
+ * the RB-tree of inodes, its information object is returned straight away.
+ * Otherwise the inode node is looked up in the index, read, and added to the
+ * RB-tree. Returns the inode information pointer in case of success and a
+ * negative error code (as an error pointer) in case of failure.
+ */
+static struct fsck_inode *read_add_inode(struct ubifs_info *c,
+					 struct fsck_data *fsckd, ino_t inum)
+{
+	struct fsck_inode *fscki = search_inode(fsckd, inum);
+	struct ubifs_ino_node *ino;
+	struct ubifs_zbranch *zbr;
+	struct ubifs_znode *znode;
+	union ubifs_key key;
+	int n, err;
+
+	if (fscki)
+		return fscki;
+
+	ino_key_init(c, &key, inum);
+	err = ubifs_lookup_level0(c, &key, &znode, &n);
+	if (err < 0) {
+		ubifs_err("error %d while looking up inode %lu", err, inum);
+		return ERR_PTR(err);
+	}
+	if (err == 0) {
+		ubifs_err("inode %lu not found in index", inum);
+		return ERR_PTR(-ENOENT);
+	}
+
+	zbr = &znode->zbranch[n];
+	if (zbr->len < UBIFS_INO_NODE_SZ) {
+		ubifs_err("bad node %lu node length %d", inum, zbr->len);
+		return ERR_PTR(-EINVAL);
+	}
+
+	ino = kmalloc(zbr->len, GFP_NOFS);
+	if (ino == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = ubifs_tnc_read_node(c, zbr, ino);
+	if (err) {
+		ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+			  zbr->lnum, zbr->offs, err);
+		kfree(ino);
+		return ERR_PTR(err);
+	}
+
+	/* The raw node is needed only while the information is copied */
+	fscki = add_inode(c, fsckd, ino);
+	kfree(ino);
+	if (IS_ERR(fscki))
+		ubifs_err("error %ld while adding inode %lu node",
+			  PTR_ERR(fscki), inum);
+
+	return fscki;
+}
+
+/**
+ * check_leaf - check leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of the leaf node to check
+ * @priv: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which is called for
+ * every single leaf node while walking the indexing tree. It checks that the
+ * leaf node referred from the indexing tree exists, has correct CRC, and does
+ * some other basic validation. This function is also responsible for building
+ * an RB-tree of inodes - it adds all inodes into the RB-tree. It also
+ * calculates reference count, size, etc for each inode in order to later
+ * compare them to the information stored inside the inodes and detect possible
+ * inconsistencies. Returns zero in case of success and a negative error code
+ * in case of failure.
+ *
+ * What is done depends on the key type of @zbr:
+ *   o inode nodes are just added to the RB-tree of inodes;
+ *   o for data nodes the owning inode is found and the data node is checked
+ *     to be within the inode size;
+ *   o for directory and extended attribute entries both the target inode and
+ *     the parent inode are found, the reference count of the target and the
+ *     calculated size/count fields of the parent are updated.
+ * Any other key type is an error (%-EINVAL).
+ */
+static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+		      void *priv)
+{
+	ino_t inum;
+	void *node;
+	struct ubifs_ch *ch;
+	int err, type = key_type(c, &zbr->key);
+	struct fsck_inode *fscki;
+
+	/* Every node starts with the common header */
+	if (zbr->len < UBIFS_CH_SZ) {
+		ubifs_err("bad leaf length %d (LEB %d:%d)",
+			  zbr->len, zbr->lnum, zbr->offs);
+		return -EINVAL;
+	}
+
+	node = kmalloc(zbr->len, GFP_NOFS);
+	if (!node)
+		return -ENOMEM;
+
+	err = ubifs_tnc_read_node(c, zbr, node);
+	if (err) {
+		ubifs_err("cannot read leaf node at LEB %d:%d, error %d",
+			  zbr->lnum, zbr->offs, err);
+		goto out_free;
+	}
+
+	/* If this is an inode node, add it to RB-tree of inodes */
+	if (type == UBIFS_INO_KEY) {
+		fscki = add_inode(c, priv, node);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err("error %d while adding inode node", err);
+			goto out_dump;
+		}
+		goto out;
+	}
+
+	if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
+	    type != UBIFS_DATA_KEY) {
+		ubifs_err("unexpected node type %d at LEB %d:%d",
+			  type, zbr->lnum, zbr->offs);
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	ch = node;
+	if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
+		ubifs_err("too high sequence number, max. is %llu",
+			  c->max_sqnum);
+		err = -EINVAL;
+		goto out_dump;
+	}
+
+	if (type == UBIFS_DATA_KEY) {
+		long long blk_offs;
+		struct ubifs_data_node *dn = node;
+
+		/*
+		 * Search the inode node this data node belongs to and insert
+		 * it to the RB-tree of inodes.
+		 */
+		inum = key_inum_flash(c, &dn->key);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err("error %d while processing data node and "
+				  "trying to find inode node %lu", err, inum);
+			goto out_dump;
+		}
+
+		/* Make sure the data node is within inode size */
+		blk_offs = key_block_flash(c, &dn->key);
+		blk_offs <<= UBIFS_BLOCK_SHIFT;
+		blk_offs += le32_to_cpu(dn->size);
+		if (blk_offs > fscki->size) {
+			ubifs_err("data node at LEB %d:%d is not within inode "
+				  "size %lld", zbr->lnum, zbr->offs,
+				  fscki->size);
+			err = -EINVAL;
+			goto out_dump;
+		}
+	} else {
+		int nlen;
+		struct ubifs_dent_node *dent = node;
+		struct fsck_inode *fscki1;
+
+		err = ubifs_validate_entry(c, dent);
+		if (err)
+			goto out_dump;
+
+		/*
+		 * Search the inode node this entry refers to and the parent
+		 * inode node and insert them to the RB-tree of inodes.
+		 */
+		inum = le64_to_cpu(dent->inum);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err("error %d while processing entry node and "
+				  "trying to find inode node %lu", err, inum);
+			goto out_dump;
+		}
+
+		/* Count how many direntries or xentries refers this inode */
+		fscki->references += 1;
+
+		inum = key_inum_flash(c, &dent->key);
+		fscki1 = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki1)) {
+			/*
+			 * The error code has to be taken from the parent
+			 * lookup result, @fscki is a valid pointer here.
+			 */
+			err = PTR_ERR(fscki1);
+			ubifs_err("error %d while processing entry node and "
+				  "trying to find parent inode node %lu",
+				  err, inum);
+			goto out_dump;
+		}
+
+		/* Account this entry in the calculated fields of the parent */
+		nlen = le16_to_cpu(dent->nlen);
+		if (type == UBIFS_XENT_KEY) {
+			fscki1->calc_xcnt += 1;
+			fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
+			fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
+			fscki1->calc_xnms += nlen;
+		} else {
+			fscki1->calc_sz += CALC_DENT_SIZE(nlen);
+			if (dent->type == UBIFS_ITYPE_DIR)
+				fscki1->calc_cnt += 1;
+		}
+	}
+
+out:
+	kfree(node);
+	return 0;
+
+out_dump:
+	ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
+	dbg_dump_node(c, node);
+out_free:
+	kfree(node);
+	return err;
+}
+
+/**
+ * free_inodes - destroy the RB-tree of inodes.
+ * @fsckd: FS checking information
+ *
+ * This function frees all 'struct fsck_inode' objects of the RB-tree of
+ * inodes. The tree is not re-balanced: leaves are unlinked from their
+ * parents and freed one after another until nothing is left.
+ */
+static void free_inodes(struct fsck_data *fsckd)
+{
+	struct rb_node *node = fsckd->inodes.rb_node;
+
+	while (node) {
+		struct rb_node *up;
+
+		/* Go down until a leaf is found, the left side goes first */
+		if (node->rb_left) {
+			node = node->rb_left;
+			continue;
+		}
+		if (node->rb_right) {
+			node = node->rb_right;
+			continue;
+		}
+
+		/* Unlink the leaf from its parent (if any) and free it */
+		up = rb_parent(node);
+		if (up) {
+			if (up->rb_left == node)
+				up->rb_left = NULL;
+			else
+				up->rb_right = NULL;
+		}
+		kfree(rb_entry(node, struct fsck_inode, rb));
+		node = up;
+	}
+}
+
+/**
+ * check_inodes - checks all inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which walks the
+ * RB-tree of inodes after the index scan has been finished, and checks that
+ * inode nlink, size, etc are correct. Returns zero if inodes are fine,
+ * %-EINVAL if not, and a negative error code in case of failure.
+ *
+ * The function stops at the first inconsistent inode, reads this inode from
+ * the flash and dumps it. If the inode cannot be looked up or read for the
+ * dump, the corresponding error code is returned instead of %-EINVAL.
+ */
+static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
+{
+ int n, err;
+ union ubifs_key key;
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch *zbr;
+ struct ubifs_ino_node *ino;
+ struct fsck_inode *fscki;
+ struct rb_node *this = rb_first(&fsckd->inodes);
+
+ while (this) {
+ fscki = rb_entry(this, struct fsck_inode, rb);
+ this = rb_next(this);
+
+ if (S_ISDIR(fscki->mode)) {
+ /*
+ * Directories have to have exactly one reference (they
+ * cannot have hardlinks), although root inode is an
+ * exception.
+ */
+ if (fscki->inum != UBIFS_ROOT_INO &&
+ fscki->references != 1) {
+ ubifs_err("directory inode %lu has %d "
+ "direntries which refer it, but "
+ "should be 1", fscki->inum,
+ fscki->references);
+ goto out_dump;
+ }
+ if (fscki->inum == UBIFS_ROOT_INO &&
+ fscki->references != 0) {
+ ubifs_err("root inode %lu has non-zero (%d) "
+ "direntries which refer it",
+ fscki->inum, fscki->references);
+ goto out_dump;
+ }
+ /* Directory size is the sum of sizes of its entries */
+ if (fscki->calc_sz != fscki->size) {
+ ubifs_err("directory inode %lu size is %lld, "
+ "but calculated size is %lld",
+ fscki->inum, fscki->size,
+ fscki->calc_sz);
+ goto out_dump;
+ }
+ /* Directory nlink is 2 plus the count of sub-directories */
+ if (fscki->calc_cnt != fscki->nlink) {
+ ubifs_err("directory inode %lu nlink is %d, "
+ "but calculated nlink is %d",
+ fscki->inum, fscki->nlink,
+ fscki->calc_cnt);
+ goto out_dump;
+ }
+ } else {
+ /* For non-directories nlink is the count of references */
+ if (fscki->references != fscki->nlink) {
+ ubifs_err("inode %lu nlink is %d, but "
+ "calculated nlink is %d", fscki->inum,
+ fscki->nlink, fscki->references);
+ goto out_dump;
+ }
+ }
+ /* Extended attribute accounting is checked for all inode types */
+ if (fscki->xattr_sz != fscki->calc_xsz) {
+ ubifs_err("inode %lu has xattr size %u, but "
+ "calculated size is %lld",
+ fscki->inum, fscki->xattr_sz,
+ fscki->calc_xsz);
+ goto out_dump;
+ }
+ if (fscki->xattr_cnt != fscki->calc_xcnt) {
+ ubifs_err("inode %lu has %u xattrs, but "
+ "calculated count is %lld", fscki->inum,
+ fscki->xattr_cnt, fscki->calc_xcnt);
+ goto out_dump;
+ }
+ if (fscki->xattr_nms != fscki->calc_xnms) {
+ ubifs_err("inode %lu has xattr names' size %u, but "
+ "calculated names' size is %lld",
+ fscki->inum, fscki->xattr_nms,
+ fscki->calc_xnms);
+ goto out_dump;
+ }
+ }
+
+ return 0;
+
+out_dump:
+ /* Read the bad inode and dump it */
+ ino_key_init(c, &key, fscki->inum);
+ err = ubifs_lookup_level0(c, &key, &znode, &n);
+ if (!err) {
+ ubifs_err("inode %lu not found in index", fscki->inum);
+ return -ENOENT;
+ } else if (err < 0) {
+ ubifs_err("error %d while looking up inode %lu",
+ err, fscki->inum);
+ return err;
+ }
+
+ zbr = &znode->zbranch[n];
+ ino = kmalloc(zbr->len, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ err = ubifs_tnc_read_node(c, zbr, ino);
+ if (err) {
+ ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+ zbr->lnum, zbr->offs, err);
+ kfree(ino);
+ return err;
+ }
+
+ ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
+ fscki->inum, zbr->lnum, zbr->offs);
+ dbg_dump_node(c, ino);
+ kfree(ino);
+ return -EINVAL;
+}
+
+/**
+ * dbg_check_filesystem - check the whole file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function does the following checks:
+ *   o all leaf nodes referred from the index exist and have correct CRCs;
+ *   o nlink, size, xattr size/count are correct for all inodes.
+ *
+ * The whole indexing tree and all nodes are read, so the function is rather
+ * heavy-weight. Returns zero if the file-system is consistent or if the
+ * %UBIFS_CHK_FS check is not enabled, %-EINVAL if it is not consistent, and
+ * a negative error code in case of failure.
+ */
+int dbg_check_filesystem(struct ubifs_info *c)
+{
+	struct fsck_data fsckd;
+	int err;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_FS))
+		return 0;
+
+	fsckd.inodes = RB_ROOT;
+
+	/* Collect information about inodes first, then verify it */
+	err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
+	if (!err)
+		err = check_inodes(c, &fsckd);
+
+	if (err) {
+		ubifs_err("file-system check failed with error %d", err);
+		dump_stack();
+	}
+
+	free_inodes(&fsckd);
+	return err;
+}
+
+/* How many times 'dbg_force_in_the_gaps()' was called while enabled */
+static int invocation_cnt;
+
+/*
+ * Returns non-zero if the "in-the-gaps" commit method has to be forced. This
+ * happens on every 8th call, starting from the very first one, and only if
+ * forcing is enabled.
+ */
+int dbg_force_in_the_gaps(void)
+{
+	int cnt;
+
+	if (!dbg_force_in_the_gaps_enabled)
+		return 0;
+
+	/* Force in-the-gaps every 8th commit */
+	cnt = invocation_cnt++;
+	return (cnt & 0x7) == 0;
+}
+
+/* Failure mode for recovery testing */
+
+/*
+ * Evaluates to true if a fresh 'simple_rand()' value (which is in the range
+ * 0..32767) does not exceed @n / @d of 32768, i.e. roughly with probability
+ * @n / @d assuming 'simple_rand()' values are evenly distributed.
+ */
+#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
+
+/*
+ * struct failure_mode_info - an element of the list of UBIFS instances
+ * registered for failure mode testing.
+ * @list: link in the 'fmi_list' list
+ * @c: UBIFS file-system description object of the registered instance
+ */
+struct failure_mode_info {
+ struct list_head list;
+ struct ubifs_info *c;
+};
+
+/* List of 'struct failure_mode_info' objects and the lock protecting it */
+static LIST_HEAD(fmi_list);
+static DEFINE_SPINLOCK(fmi_lock);
+
+/* State of the 'simple_rand()' generator, zero means "not seeded yet" */
+static unsigned int next;
+
+/*
+ * A simple pseudo-random number generator. The state is seeded with the PID
+ * of the current task whenever it is zero. Returns a value in the range
+ * 0..32767.
+ */
+static int simple_rand(void)
+{
+	unsigned int state = next;
+
+	if (!state)
+		state = current->pid;
+	state = state * 1103515245 + 12345;
+	next = state;
+
+	return (state >> 16) & 32767;
+}
+
+/*
+ * Add UBIFS instance @c to the list of instances which take part in failure
+ * mode testing. If there is no memory, an error message is printed and the
+ * instance is left unregistered.
+ */
+void dbg_failure_mode_registration(struct ubifs_info *c)
+{
+	struct failure_mode_info *entry;
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (entry == NULL) {
+		dbg_err("Failed to register failure mode - no memory");
+		return;
+	}
+
+	entry->c = c;
+
+	spin_lock(&fmi_lock);
+	list_add_tail(&entry->list, &fmi_list);
+	spin_unlock(&fmi_lock);
+}
+
+/*
+ * Remove all entries which refer UBIFS instance @c from the list of
+ * instances which take part in failure mode testing, and free them.
+ */
+void dbg_failure_mode_deregistration(struct ubifs_info *c)
+{
+	struct failure_mode_info *cur, *nxt;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry_safe(cur, nxt, &fmi_list, list) {
+		if (cur->c != c)
+			continue;
+		list_del(&cur->list);
+		kfree(cur);
+	}
+	spin_unlock(&fmi_lock);
+}
+
+/*
+ * Find the registered UBIFS instance which uses UBI volume descriptor @desc.
+ * Returns the UBIFS file-system description object or %NULL if no registered
+ * instance uses @desc.
+ */
+static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
+{
+	struct ubifs_info *found = NULL;
+	struct failure_mode_info *fmi;
+
+	spin_lock(&fmi_lock);
+	list_for_each_entry(fmi, &fmi_list, list) {
+		if (fmi->c->ubi == desc) {
+			found = fmi->c;
+			break;
+		}
+	}
+	spin_unlock(&fmi_lock);
+
+	return found;
+}
+
+/*
+ * Returns the failure mode flag of the UBIFS instance which uses UBI volume
+ * descriptor @desc. Returns zero if the instance is not registered or if
+ * failure mode testing is disabled.
+ */
+static int in_failure_mode(struct ubi_volume_desc *desc)
+{
+	struct ubifs_info *c = dbg_find_info(desc);
+
+	if (!c || !dbg_failure_mode)
+		return 0;
+
+	return c->failure_mode;
+}
+
+/*
+ * do_fail - decide whether an operation on a LEB has to fail.
+ * @desc: UBI volume descriptor, used to find the UBIFS instance
+ * @lnum: number of the LEB the operation is about
+ * @write: 1 when called from 'dbg_leb_write()' or 'dbg_leb_change()', 0 when
+ *         called from the erase, unmap and map wrappers
+ *
+ * Returns 1 if the instance is already in failure mode or if this call has
+ * just switched failure mode on, and 0 otherwise (including the cases when
+ * the instance is not registered or failure mode testing is disabled).
+ *
+ * On the first call an optional delay is chosen at random: either a time-out
+ * (@c->fail_delay is 1) or a number of calls (@c->fail_delay is 2). Once the
+ * delay has expired, each call switches failure mode on with a probability
+ * which depends on the area @lnum belongs to and on @write.
+ */
+static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
+{
+ struct ubifs_info *c = dbg_find_info(desc);
+
+ if (!c || !dbg_failure_mode)
+ return 0;
+ /* Once failure mode is on, everything fails */
+ if (c->failure_mode)
+ return 1;
+ if (!c->fail_cnt) {
+ /* First call - decide delay to failure */
+ if (chance(1, 2)) {
+ /* 'simple_rand() >> 11' is 0..15, so delay is 1..32768 */
+ unsigned int delay = 1 << (simple_rand() >> 11);
+
+ if (chance(1, 2)) {
+ /* Delay is a time-out in milliseconds */
+ c->fail_delay = 1;
+ c->fail_timeout = jiffies +
+ msecs_to_jiffies(delay);
+ dbg_rcvry("failing after %ums", delay);
+ } else {
+ /* Delay is a number of calls */
+ c->fail_delay = 2;
+ c->fail_cnt_max = delay;
+ dbg_rcvry("failing after %u calls", delay);
+ }
+ }
+ /* Make the counter non-zero so this branch is taken only once */
+ c->fail_cnt += 1;
+ }
+ /* Determine if failure delay has expired */
+ if (c->fail_delay == 1) {
+ if (time_before(jiffies, c->fail_timeout))
+ return 0;
+ } else if (c->fail_delay == 2)
+ if (c->fail_cnt++ < c->fail_cnt_max)
+ return 0;
+ /*
+ * In every branch below 'chance()' gives the probability of NOT
+ * failing this time.
+ */
+ if (lnum == UBIFS_SB_LNUM) {
+ if (write) {
+ if (chance(1, 2))
+ return 0;
+ } else if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in super block LEB %d", lnum);
+ } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
+ if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in master LEB %d", lnum);
+ } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
+ if (write) {
+ if (chance(99, 100))
+ return 0;
+ } else if (chance(399, 400))
+ return 0;
+ dbg_rcvry("failing in log LEB %d", lnum);
+ } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
+ if (write) {
+ if (chance(7, 8))
+ return 0;
+ } else if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in LPT LEB %d", lnum);
+ } else if (lnum >= c->orph_first && lnum <= c->orph_last) {
+ if (write) {
+ if (chance(1, 2))
+ return 0;
+ } else if (chance(9, 10))
+ return 0;
+ dbg_rcvry("failing in orphan LEB %d", lnum);
+ } else if (lnum == c->ihead_lnum) {
+ if (chance(99, 100))
+ return 0;
+ dbg_rcvry("failing in index head LEB %d", lnum);
+ } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
+ if (chance(9, 10))
+ return 0;
+ dbg_rcvry("failing in GC head LEB %d", lnum);
+ } else if (write && !RB_EMPTY_ROOT(&c->buds) &&
+ !ubifs_search_bud(c, lnum)) {
+ if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in non-bud LEB %d", lnum);
+ } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
+ c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ if (chance(999, 1000))
+ return 0;
+ dbg_rcvry("failing in bud LEB %d commit running", lnum);
+ } else {
+ if (chance(9999, 10000))
+ return 0;
+ dbg_rcvry("failing in bud LEB %d commit not running", lnum);
+ }
+ ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
+ c->failure_mode = 1;
+ dump_stack();
+ return 1;
+}
+
+/*
+ * Emulate a write which was interrupted: keep a random number of bytes at
+ * the beginning of @buf (@len bytes long) and fill the rest with 0xFF bytes.
+ * Note, the buffer is modified despite of the 'const' qualifier.
+ */
+static void cut_data(const void *buf, int len)
+{
+	unsigned char *bytes = (void *)buf;
+	int keep = (len * (long long)simple_rand()) >> 15;
+	int pos = keep;
+
+	while (pos < len)
+		bytes[pos++] = 0xff;
+}
+
+/*
+ * Wrapper of 'ubi_leb_read()' which returns %-EIO without reading anything
+ * if the UBIFS instance is in failure mode.
+ */
+int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
+		 int len, int check)
+{
+	int failing = in_failure_mode(desc);
+
+	if (!failing)
+		return ubi_leb_read(desc, lnum, buf, offset, len, check);
+	return -EIO;
+}
+
+/*
+ * Wrapper of 'ubi_leb_write()' for failure mode testing. Returns %-EIO if
+ * the UBIFS instance is in failure mode before or after the write. If it is
+ * decided that this very write has to fail, the tail of the data is
+ * corrupted with 'cut_data()' before it is written.
+ */
+int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
+		  int offset, int len, int dtype)
+{
+	int ret;
+
+	if (in_failure_mode(desc))
+		return -EIO;
+
+	if (do_fail(desc, lnum, 1))
+		cut_data(buf, len);
+
+	ret = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
+	if (ret)
+		return ret;
+
+	return in_failure_mode(desc) ? -EIO : 0;
+}
+
+/*
+ * Wrapper of 'ubi_leb_change()' for failure mode testing. Returns %-EIO if
+ * 'do_fail()' tells to fail before or after the operation, otherwise returns
+ * what 'ubi_leb_change()' returned.
+ */
+int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+		   int len, int dtype)
+{
+	int ret = -EIO;
+
+	if (!do_fail(desc, lnum, 1)) {
+		ret = ubi_leb_change(desc, lnum, buf, len, dtype);
+		if (!ret && do_fail(desc, lnum, 1))
+			ret = -EIO;
+	}
+
+	return ret;
+}
+
+/*
+ * Wrapper of 'ubi_leb_erase()' for failure mode testing. Returns %-EIO if
+ * 'do_fail()' tells to fail before or after the operation, otherwise returns
+ * what 'ubi_leb_erase()' returned.
+ */
+int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
+{
+	int ret = -EIO;
+
+	if (!do_fail(desc, lnum, 0)) {
+		ret = ubi_leb_erase(desc, lnum);
+		if (!ret && do_fail(desc, lnum, 0))
+			ret = -EIO;
+	}
+
+	return ret;
+}
+
+/*
+ * Wrapper of 'ubi_leb_unmap()' for failure mode testing. Returns %-EIO if
+ * 'do_fail()' tells to fail before or after the operation, otherwise returns
+ * what 'ubi_leb_unmap()' returned.
+ */
+int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
+{
+	int ret = -EIO;
+
+	if (!do_fail(desc, lnum, 0)) {
+		ret = ubi_leb_unmap(desc, lnum);
+		if (!ret && do_fail(desc, lnum, 0))
+			ret = -EIO;
+	}
+
+	return ret;
+}
+
+/*
+ * Wrapper of 'ubi_is_mapped()' which returns %-EIO without asking UBI if the
+ * UBIFS instance is in failure mode.
+ */
+int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
+{
+	int failing = in_failure_mode(desc);
+
+	if (!failing)
+		return ubi_is_mapped(desc, lnum);
+	return -EIO;
+}
+
+/*
+ * Wrapper of 'ubi_leb_map()' for failure mode testing. Returns %-EIO if
+ * 'do_fail()' tells to fail before or after the operation, otherwise returns
+ * what 'ubi_leb_map()' returned.
+ */
+int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
+{
+	int ret = -EIO;
+
+	if (!do_fail(desc, lnum, 0)) {
+		ret = ubi_leb_map(desc, lnum, dtype);
+		if (!ret && do_fail(desc, lnum, 0))
+			ret = -EIO;
+	}
+
+	return ret;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
new file mode 100644
index 00000000000..3c4f1e93c9e
--- /dev/null
+++ b/fs/ubifs/debug.h
@@ -0,0 +1,403 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+#ifndef __UBIFS_DEBUG_H__
+#define __UBIFS_DEBUG_H__
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+#define UBIFS_DBG(op) op
+
+/*
+ * ubifs_assert - debugging assertion.
+ *
+ * If @expr evaluates to false, print a KERN_CRIT message with the function
+ * name, the line number and the pid of the current task, then call
+ * 'dbg_dump_stack()'. Execution continues afterwards.
+ */
+#define ubifs_assert(expr) do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+ __func__, __LINE__, current->pid); \
+ dbg_dump_stack(); \
+ } \
+} while (0)
+
+/*
+ * ubifs_assert_cmt_locked - assert that the commit semaphore of @c is held.
+ *
+ * Tries to take 'c->commit_sem' for writing. If the attempt succeeds the
+ * semaphore was not held, so it is released again, a KERN_CRIT message is
+ * printed and 'ubifs_assert(0)' is triggered.
+ */
+#define ubifs_assert_cmt_locked(c) do { \
+ if (unlikely(down_write_trylock(&(c)->commit_sem))) { \
+ up_write(&(c)->commit_sem); \
+ printk(KERN_CRIT "commit lock is not locked!\n"); \
+ ubifs_assert(0); \
+ } \
+} while (0)
+
+/*
+ * dbg_dump_stack - dump the stack unless the recovery-testing failure mode
+ * ('dbg_failure_mode') is enabled.
+ */
+#define dbg_dump_stack() do { \
+ if (!dbg_failure_mode) \
+ dump_stack(); \
+} while (0)
+
+/*
+ * Generic debugging messages.
+ *
+ * dbg_msg - print a KERN_DEBUG message prefixed with the pid of the current
+ * task and the name of the calling function. A newline is appended to @fmt.
+ * The 'printk()' call is made with 'dbg_lock' held.
+ */
+#define dbg_msg(fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+
+/*
+ * dbg_do_msg - print a debugging message if its type is enabled.
+ * @typ: message type flag(s), one or more 'UBIFS_MSG_*' values
+ * @fmt: 'printk()'-style format string, followed by its arguments
+ *
+ * The message is passed to 'dbg_msg()' when any bit of @typ is set in
+ * 'ubifs_msg_flags'. @typ is parenthesized so that an expression such as
+ * 'A | B' is tested as a whole.
+ */
+#define dbg_do_msg(typ, fmt, ...) do { \
+	if (ubifs_msg_flags & (typ)) \
+		dbg_msg(fmt, ##__VA_ARGS__); \
+} while (0)
+
+/*
+ * dbg_err - print an error message with 'ubifs_err()' while holding
+ * 'dbg_lock'.
+ */
+#define dbg_err(fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ ubifs_err(fmt, ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+
+const char *dbg_key_str0(const struct ubifs_info *c,
+ const union ubifs_key *key);
+const char *dbg_key_str1(const struct ubifs_info *c,
+ const union ubifs_key *key);
+
+/*
+ * DBGKEY macros require dbg_lock to be held, which it is in the dbg message
+ * macros. Both macros expand to a call which uses a variable named 'c' from
+ * the calling scope, so they can only be used where such a variable exists.
+ */
+#define DBGKEY(key) dbg_key_str0(c, (key))
+#define DBGKEY1(key) dbg_key_str1(c, (key))
+
+/* General messages */
+#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+
+/* Additional journal messages */
+#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
+
+/* Additional TNC messages */
+#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
+
+/* Additional lprops messages */
+#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
+
+/* Additional LEB find messages */
+#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
+
+/* Additional mount messages */
+#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
+
+/* Additional I/O messages */
+#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
+
+/* Additional commit messages */
+#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
+
+/* Additional budgeting messages */
+#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
+
+/* Additional log messages */
+#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
+
+/* Additional gc messages */
+#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
+
+/* Additional scan messages */
+#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
+
+/* Additional recovery messages */
+#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
+
+/*
+ * Debugging message type flags (must match msg_type_names in debug.c).
+ *
+ * UBIFS_MSG_GEN: general messages
+ * UBIFS_MSG_JNL: journal messages
+ * UBIFS_MSG_MNT: mount messages
+ * UBIFS_MSG_CMT: commit messages
+ * UBIFS_MSG_FIND: LEB find messages
+ * UBIFS_MSG_BUDG: budgeting messages
+ * UBIFS_MSG_GC: garbage collection messages
+ * UBIFS_MSG_TNC: TNC messages
+ * UBIFS_MSG_LP: lprops messages
+ * UBIFS_MSG_IO: I/O messages
+ * UBIFS_MSG_LOG: log messages
+ * UBIFS_MSG_SCAN: scan messages
+ * UBIFS_MSG_RCVRY: recovery messages
+ */
+enum {
+ UBIFS_MSG_GEN = 0x1,
+ UBIFS_MSG_JNL = 0x2,
+ UBIFS_MSG_MNT = 0x4,
+ UBIFS_MSG_CMT = 0x8,
+ UBIFS_MSG_FIND = 0x10,
+ UBIFS_MSG_BUDG = 0x20,
+ UBIFS_MSG_GC = 0x40,
+ UBIFS_MSG_TNC = 0x80,
+ UBIFS_MSG_LP = 0x100,
+ UBIFS_MSG_IO = 0x200,
+ UBIFS_MSG_LOG = 0x400,
+ UBIFS_MSG_SCAN = 0x800,
+ UBIFS_MSG_RCVRY = 0x1000,
+};
+
+/* Debugging message type flags for each default debug message level */
+#define UBIFS_MSG_LVL_0 0
+#define UBIFS_MSG_LVL_1 0x1
+#define UBIFS_MSG_LVL_2 0x7f
+#define UBIFS_MSG_LVL_3 0xffff
+
+/*
+ * Debugging check flags (must match chk_names in debug.c).
+ *
+ * UBIFS_CHK_GEN: general checks
+ * UBIFS_CHK_TNC: check TNC
+ * UBIFS_CHK_IDX_SZ: check index size
+ * UBIFS_CHK_ORPH: check orphans
+ * UBIFS_CHK_OLD_IDX: check the old index
+ * UBIFS_CHK_LPROPS: check lprops
+ * UBIFS_CHK_FS: check the file-system
+ */
+enum {
+ UBIFS_CHK_GEN = 0x1,
+ UBIFS_CHK_TNC = 0x2,
+ UBIFS_CHK_IDX_SZ = 0x4,
+ UBIFS_CHK_ORPH = 0x8,
+ UBIFS_CHK_OLD_IDX = 0x10,
+ UBIFS_CHK_LPROPS = 0x20,
+ UBIFS_CHK_FS = 0x40,
+};
+
+/*
+ * Special testing flags (must match tst_names in debug.c).
+ *
+ * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
+ * UBIFS_TST_RCVRY: failure mode for recovery testing
+ *
+ * NOTE(review): bit 0x1 is not assigned to any flag here - confirm that
+ * tst_names in debug.c accounts for the unused bit.
+ */
+enum {
+ UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
+ UBIFS_TST_RCVRY = 0x4,
+};
+
+#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
+#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
+#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
+#else
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
+#endif
+
+#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
+#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
+#else
+#define UBIFS_CHK_FLAGS_DEFAULT 0
+#endif
+
+extern spinlock_t dbg_lock;
+
+extern unsigned int ubifs_msg_flags;
+extern unsigned int ubifs_chk_flags;
+extern unsigned int ubifs_tst_flags;
+
+/* Dump functions */
+
+const char *dbg_ntype(int type);
+const char *dbg_cstate(int cmt_state);
+const char *dbg_get_key_dump(const struct ubifs_info *c,
+ const union ubifs_key *key);
+void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
+void dbg_dump_node(const struct ubifs_info *c, const void *node);
+void dbg_dump_budget_req(const struct ubifs_budget_req *req);
+void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
+void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
+void dbg_dump_lprops(struct ubifs_info *c);
+void dbg_dump_leb(const struct ubifs_info *c, int lnum);
+void dbg_dump_znode(const struct ubifs_info *c,
+ const struct ubifs_znode *znode);
+void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
+void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip);
+void dbg_dump_tnc(struct ubifs_info *c);
+void dbg_dump_index(struct ubifs_info *c);
+
+/* Checking helper functions */
+
+typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr, void *priv);
+typedef int (*dbg_znode_callback)(struct ubifs_info *c,
+ struct ubifs_znode *znode, void *priv);
+
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+ dbg_znode_callback znode_cb, void *priv);
+
+/* Checking functions */
+
+int dbg_check_lprops(struct ubifs_info *c);
+
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+
+int dbg_check_cats(struct ubifs_info *c);
+
+int dbg_check_ltab(struct ubifs_info *c);
+
+int dbg_check_synced_i_size(struct inode *inode);
+
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
+
+int dbg_check_tnc(struct ubifs_info *c, int extra);
+
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
+
+int dbg_check_filesystem(struct ubifs_info *c);
+
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+ int add_pos);
+
+/*
+ * 'dbg_check_lprops()' is already declared earlier in this list of checking
+ * functions, so the declaration is not repeated here.
+ */
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+			int row, int col);
+
+/* Force the use of in-the-gaps method for testing */
+
+#define dbg_force_in_the_gaps_enabled \
+ (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
+
+int dbg_force_in_the_gaps(void);
+
+/* Failure mode for recovery testing */
+
+#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
+
+void dbg_failure_mode_registration(struct ubifs_info *c);
+void dbg_failure_mode_deregistration(struct ubifs_info *c);
+
+#ifndef UBIFS_DBG_PRESERVE_UBI
+
+#define ubi_leb_read dbg_leb_read
+#define ubi_leb_write dbg_leb_write
+#define ubi_leb_change dbg_leb_change
+#define ubi_leb_erase dbg_leb_erase
+#define ubi_leb_unmap dbg_leb_unmap
+#define ubi_is_mapped dbg_is_mapped
+#define ubi_leb_map dbg_leb_map
+
+#endif
+
+int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
+ int len, int check);
+int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int offset, int len, int dtype);
+int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int len, int dtype);
+int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
+int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
+
+/* Shorthand for 'dbg_leb_read()' which always passes zero as @check */
+static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
+			   int offset, int len)
+{
+	const int check = 0;
+
+	return dbg_leb_read(desc, lnum, buf, offset, len, check);
+}
+
+/* Shorthand for 'dbg_leb_write()' which always passes %UBI_UNKNOWN as @dtype */
+static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
+			    const void *buf, int offset, int len)
+{
+	const int dtype = UBI_UNKNOWN;
+
+	return dbg_leb_write(desc, lnum, buf, offset, len, dtype);
+}
+
+/* Shorthand for 'dbg_leb_change()' which always passes %UBI_UNKNOWN as @dtype */
+static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
+			     const void *buf, int len)
+{
+	const int dtype = UBI_UNKNOWN;
+
+	return dbg_leb_change(desc, lnum, buf, len, dtype);
+}
+
+#else /* !CONFIG_UBIFS_FS_DEBUG */
+
+/*
+ * Debugging is compiled out: the assertion and message macros below expand
+ * either to nothing or to an empty statement expression, so their arguments
+ * are not evaluated.
+ *
+ * NOTE(review): 'dbg_key()' has no counterpart in the debugging branch of this
+ * header as far as this file shows - confirm whether the stub is still needed.
+ */
+#define UBIFS_DBG(op)
+#define ubifs_assert(expr) ({})
+#define ubifs_assert_cmt_locked(c)
+#define dbg_dump_stack()
+#define dbg_err(fmt, ...) ({})
+#define dbg_msg(fmt, ...) ({})
+#define dbg_key(c, key, fmt, ...) ({})
+
+#define dbg_gen(fmt, ...) ({})
+#define dbg_jnl(fmt, ...) ({})
+#define dbg_tnc(fmt, ...) ({})
+#define dbg_lp(fmt, ...) ({})
+#define dbg_find(fmt, ...) ({})
+#define dbg_mnt(fmt, ...) ({})
+#define dbg_io(fmt, ...) ({})
+#define dbg_cmt(fmt, ...) ({})
+#define dbg_budg(fmt, ...) ({})
+#define dbg_log(fmt, ...) ({})
+#define dbg_gc(fmt, ...) ({})
+#define dbg_scan(fmt, ...) ({})
+#define dbg_rcvry(fmt, ...) ({})
+
+#define dbg_ntype(type) ""
+#define dbg_cstate(cmt_state) ""
+#define dbg_get_key_dump(c, key) ({})
+#define dbg_dump_inode(c, inode) ({})
+#define dbg_dump_node(c, node) ({})
+#define dbg_dump_budget_req(req) ({})
+#define dbg_dump_lstats(lst) ({})
+#define dbg_dump_budg(c) ({})
+#define dbg_dump_lprop(c, lp) ({})
+#define dbg_dump_lprops(c) ({})
+#define dbg_dump_leb(c, lnum) ({})
+#define dbg_dump_znode(c, znode) ({})
+#define dbg_dump_heap(c, heap, cat) ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c) ({})
+#define dbg_dump_index(c) ({})
+
+#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
+
+#define dbg_old_index_check_init(c, zroot) 0
+#define dbg_check_old_index(c, zroot) 0
+
+#define dbg_check_cats(c) 0
+
+#define dbg_check_ltab(c) 0
+
+#define dbg_check_synced_i_size(inode) 0
+
+#define dbg_check_dir_size(c, dir) 0
+
+#define dbg_check_tnc(c, x) 0
+
+#define dbg_check_idx_size(c, idx_size) 0
+
+#define dbg_check_filesystem(c) 0
+
+#define dbg_check_heap(c, heap, cat, add_pos) ({})
+
+#define dbg_check_lprops(c) 0
+#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+
+#define dbg_force_in_the_gaps_enabled 0
+#define dbg_force_in_the_gaps() 0
+
+#define dbg_failure_mode 0
+#define dbg_failure_mode_registration(c) ({})
+#define dbg_failure_mode_deregistration(c) ({})
+
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
+
+#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
new file mode 100644
index 00000000000..e90374be7d3
--- /dev/null
+++ b/fs/ubifs/dir.c
@@ -0,0 +1,1240 @@
+/* * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ * Zoltan Sogor
+ */
+
+/*
+ * This file implements directory operations.
+ *
+ * All FS operations in this file allocate budget before writing anything to the
+ * media. If they fail to allocate it, the error is returned. The only
+ * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
+ * if they are unable to allocate the budget, because a %-ENOSPC failure on
+ * deletion is not what users are usually ready to get. UBIFS budgeting
+ * subsystem has some space reserved for these purposes.
+ *
+ * All operations in this file write all inodes which they change straight
+ * away, instead of marking them dirty. For example, 'ubifs_link()' changes
+ * @i_size of the parent inode and writes the parent inode together with the
+ * target inode. This was done to simplify file-system recovery which would
+ * otherwise be very difficult to do. The only exception is rename, which
+ * updates @i_ctime of the re-named inode and only marks that inode dirty
+ * instead of writing it.
+ */
+
+#include "ubifs.h"
+
+/**
+ * inherit_flags - compute the flags a new inode inherits from its parent.
+ * @dir: parent inode
+ * @mode: mode flags of the inode being created
+ *
+ * Helper for 'ubifs_new_inode()'. The following flags of the parent directory
+ * inode @dir are inherited:
+ * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on
+ *   sub-directory basis;
+ * o %UBIFS_SYNC_FL - useful for the same reasons;
+ * o %UBIFS_DIRSYNC_FL - similar, but only if the new inode is a directory.
+ *
+ * Returns the inherited flags, or zero if @dir is not a directory.
+ */
+static int inherit_flags(const struct inode *dir, int mode)
+{
+	const int inheritable = UBIFS_COMPR_FL | UBIFS_SYNC_FL |
+				UBIFS_DIRSYNC_FL;
+	int flags;
+
+	/*
+	 * A parent which is not a directory means that an extended attribute
+	 * inode is being created, and those inherit nothing.
+	 */
+	if (!S_ISDIR(dir->i_mode))
+		return 0;
+
+	flags = ubifs_inode(dir)->flags & inheritable;
+	if (S_ISDIR(mode))
+		return flags;
+
+	/* "DIRSYNC" is meaningful for directories only */
+	return flags & ~UBIFS_DIRSYNC_FL;
+}
+
+/**
+ * ubifs_new_inode - allocate new UBIFS inode object.
+ * @c: UBIFS file-system description object
+ * @dir: parent directory inode
+ * @mode: inode mode flags
+ *
+ * This function allocates a new VFS inode, initializes it according to @mode
+ * and the flags inherited from @dir, and assigns it the next inode number,
+ * generation and creation sequence number. Returns the new inode in case of
+ * success and an error pointer in case of failure: %-ENOMEM if the inode
+ * could not be allocated and %-EINVAL if the file-system ran out of inode
+ * numbers.
+ */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+			      int mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	inode = new_inode(c->vfs_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	/* Derive the UBIFS inode only once @inode is known to be valid */
+	ui = ubifs_inode(inode);
+
+	/*
+	 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and
+	 * marking them dirty in file write path (see 'file_update_time()').
+	 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
+	 * to make budgeting work.
+	 */
+	inode->i_flags |= (S_NOCMTIME);
+
+	inode->i_uid = current->fsuid;
+	if (dir->i_mode & S_ISGID) {
+		/* Set-GID parent: inherit the group, directories keep the bit */
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+	inode->i_mtime = inode->i_atime = inode->i_ctime =
+			 ubifs_current_time(inode);
+	inode->i_mapping->nrpages = 0;
+	/* Disable readahead */
+	inode->i_mapping->backing_dev_info = &c->bdi;
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &ubifs_file_address_operations;
+		inode->i_op = &ubifs_file_inode_operations;
+		inode->i_fop = &ubifs_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op  = &ubifs_dir_inode_operations;
+		inode->i_fop = &ubifs_dir_operations;
+		inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ubifs_symlink_inode_operations;
+		break;
+	case S_IFSOCK:
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_op  = &ubifs_file_inode_operations;
+		break;
+	default:
+		BUG();
+	}
+
+	ui->flags = inherit_flags(dir, mode);
+	ubifs_set_inode_flags(inode);
+	if (S_ISREG(mode))
+		ui->compr_type = c->default_compr;
+	else
+		ui->compr_type = UBIFS_COMPR_NONE;
+	ui->synced_i_size = 0;
+
+	spin_lock(&c->cnt_lock);
+	/* Inode number overflow is currently not supported */
+	if (c->highest_inum >= INUM_WARN_WATERMARK) {
+		if (c->highest_inum >= INUM_WATERMARK) {
+			spin_unlock(&c->cnt_lock);
+			ubifs_err("out of inode numbers");
+			make_bad_inode(inode);
+			iput(inode);
+			return ERR_PTR(-EINVAL);
+		}
+		ubifs_warn("running out of inode numbers (current %lu, max %d)",
+			   c->highest_inum, INUM_WATERMARK);
+	}
+
+	inode->i_ino = ++c->highest_inum;
+	inode->i_generation = ++c->vfs_gen;
+	/*
+	 * The creation sequence number remains with this inode for its
+	 * lifetime. All nodes for this inode have a greater sequence number,
+	 * and so it is possible to distinguish obsolete nodes belonging to a
+	 * previous incarnation of the same inode number - for example, for the
+	 * purpose of rebuilding the index.
+	 */
+	ui->creat_sqnum = ++c->max_sqnum;
+	spin_unlock(&c->cnt_lock);
+	return inode;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/*
+ * dbg_check_name - check that a directory entry node carries the given name.
+ *
+ * Does nothing and returns zero unless %UBIFS_CHK_GEN is set in
+ * 'ubifs_chk_flags'. Otherwise returns zero if the name length and the name
+ * bytes stored in @dent are equal to those of @nm, and %-EINVAL if they are
+ * not.
+ */
+static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
+{
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	if (le16_to_cpu(dent->nlen) != nm->len ||
+	    memcmp(dent->name, nm->name, nm->len))
+		return -EINVAL;
+
+	return 0;
+}
+
+#else
+
+#define dbg_check_name(dent, nm) 0
+
+#endif
+
+/**
+ * ubifs_lookup - look up a directory entry by name.
+ * @dir: directory inode to search in
+ * @dentry: dentry which carries the name to look up
+ * @nd: not used by this function
+ *
+ * The name is looked up with 'ubifs_tnc_lookup_nm()'. If the entry exists, its
+ * inode is read with 'ubifs_iget()' and attached to @dentry by 'd_add()'. If
+ * the entry does not exist and @dir still has links, @dentry is added with a
+ * %NULL inode. Returns %NULL on success and an error pointer on failure:
+ * %-ENAMETOOLONG, %-ENOMEM, %-EINVAL from the debugging name check, or the
+ * error returned by the TNC lookup or by 'ubifs_iget()'.
+ */
+static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ int err;
+ union ubifs_key key;
+ struct inode *inode = NULL;
+ struct ubifs_dent_node *dent;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+ dbg_gen("'%.*s' in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+ if (dentry->d_name.len > UBIFS_MAX_NLEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ /* Scratch buffer which receives the directory entry node */
+ dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent)
+ return ERR_PTR(-ENOMEM);
+
+ dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+
+ err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
+ if (err) {
+ /*
+ * Do not hash the direntry if parent 'i_nlink' is zero, because
+ * this has side-effects - '->delete_inode()' call will not be
+ * called for the parent orphan inode, because 'd_count' of its
+ * direntry will stay 1 (it'll be negative direntry I guess)
+ * and prevent 'iput_final()' until the dentry is destroyed due
+ * to unmount or memory pressure.
+ */
+ if (err == -ENOENT && dir->i_nlink != 0) {
+ dbg_gen("not found");
+ goto done;
+ }
+ goto out;
+ }
+
+ if (dbg_check_name(dent, &dentry->d_name)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
+ if (IS_ERR(inode)) {
+ /*
+ * This should not happen. Probably the file-system needs
+ * checking.
+ */
+ err = PTR_ERR(inode);
+ ubifs_err("dead directory entry '%.*s', error %d",
+ dentry->d_name.len, dentry->d_name.name, err);
+ ubifs_ro_mode(c, err);
+ goto out;
+ }
+
+done:
+ /* @inode is still %NULL here if the name was not found */
+ kfree(dent);
+ /*
+ * Note, d_splice_alias() would be required instead if we supported
+ * NFS.
+ */
+ d_add(dentry, inode);
+ return NULL;
+
+out:
+ kfree(dent);
+ return ERR_PTR(err);
+}
+
+/**
+ * ubifs_create - create a new inode together with its directory entry.
+ * @dir: parent directory inode
+ * @dentry: dentry which carries the name of the new entry
+ * @mode: mode of the new inode
+ * @nd: not used by this function
+ *
+ * Budgets the space, allocates the inode with 'ubifs_new_inode()', increases
+ * the size of @dir by the size of the new direntry and writes the change with
+ * 'ubifs_jnl_update()' while holding 'ui_mutex' of @dir. If the journal update
+ * fails, the directory size change is rolled back and the new inode is marked
+ * bad and released. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1 };
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+
+ /*
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ */
+
+ dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ mutex_lock(&dir_ui->ui_mutex);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ insert_inode_hash(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ /* Undo the directory size change made above; the mutex is still held */
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ mutex_unlock(&dir_ui->ui_mutex);
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ ubifs_err("cannot create regular file, error %d", err);
+ return err;
+}
+
+/**
+ * vfs_dent_type - translate a UBIFS directory entry type for VFS.
+ * @type: UBIFS directory entry type
+ *
+ * Returns the VFS 'DT_*' constant which corresponds to the UBIFS
+ * 'UBIFS_ITYPE_*' value @type. An unknown @type triggers 'BUG()'.
+ */
+static unsigned int vfs_dent_type(uint8_t type)
+{
+	unsigned int dt = 0;
+
+	switch (type) {
+	case UBIFS_ITYPE_REG:
+		dt = DT_REG;
+		break;
+	case UBIFS_ITYPE_DIR:
+		dt = DT_DIR;
+		break;
+	case UBIFS_ITYPE_LNK:
+		dt = DT_LNK;
+		break;
+	case UBIFS_ITYPE_BLK:
+		dt = DT_BLK;
+		break;
+	case UBIFS_ITYPE_CHR:
+		dt = DT_CHR;
+		break;
+	case UBIFS_ITYPE_FIFO:
+		dt = DT_FIFO;
+		break;
+	case UBIFS_ITYPE_SOCK:
+		dt = DT_SOCK;
+		break;
+	default:
+		BUG();
+	}
+	return dt;
+}
+
+/*
+ * The classical Unix view for directory is that it is a linear array of
+ * (name, inode number) entries. Linux/VFS assumes this model as well.
+ * Particularly, 'readdir()' call wants us to return a directory entry offset
+ * which later may be used to continue 'readdir()'ing the directory or to
+ * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
+ * model because directory entries are identified by keys, which may collide.
+ *
+ * UBIFS uses directory entry hash value for directory offsets, so
+ * 'seekdir()'/'telldir()' may not always work because of possible key
+ * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
+ * properly by means of saving full directory entry name in the private field
+ * of the file description object.
+ *
+ * This means that UBIFS cannot support NFS which requires full
+ * 'seekdir()'/'telldir()' support.
+ */
+/*
+ * ubifs_readdir - feed directory entries to @filldir.
+ *
+ * File positions 0 and 1 stand for "." and "..", position 2 marks the end of
+ * the directory, and any other position is the key hash of the next entry to
+ * return. The directory entry node which will be returned next is kept in
+ * @file->private_data between calls. Returns zero when @filldir asks to stop
+ * or when there are no more entries, and a negative error code if looking up
+ * the next entry fails with anything but %-ENOENT.
+ */
+static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ int err, over = 0;
+ struct qstr nm;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent;
+ struct inode *dir = file->f_path.dentry->d_inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+ dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+
+ if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+ /*
+ * The directory was seek'ed to a senseless position or there
+ * are no more entries.
+ */
+ return 0;
+
+ /* File positions 0 and 1 correspond to "." and ".." */
+ if (file->f_pos == 0) {
+ ubifs_assert(!file->private_data);
+ over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
+ if (over)
+ return 0;
+ file->f_pos = 1;
+ }
+
+ if (file->f_pos == 1) {
+ ubifs_assert(!file->private_data);
+ over = filldir(dirent, "..", 2, 1,
+ parent_ino(file->f_path.dentry), DT_DIR);
+ if (over)
+ return 0;
+
+ /* Find the first entry in TNC and save it */
+ lowest_dent_key(c, &key, dir->i_ino);
+ nm.name = NULL;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ }
+
+ dent = file->private_data;
+ if (!dent) {
+ /*
+ * The directory was seek'ed to and is now readdir'ed.
+ * Find the entry corresponding to @file->f_pos or the
+ * closest one.
+ */
+ dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+ nm.name = NULL;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ }
+
+ while (1) {
+ dbg_gen("feed '%s', ino %llu, new f_pos %#x",
+ dent->name, le64_to_cpu(dent->inum),
+ key_hash_flash(c, &dent->key));
+ ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
+
+ nm.len = le16_to_cpu(dent->nlen);
+ over = filldir(dirent, dent->name, nm.len, file->f_pos,
+ le64_to_cpu(dent->inum),
+ vfs_dent_type(dent->type));
+ /* Stop here; @dent stays in private_data for the next call */
+ if (over)
+ return 0;
+
+ /* Switch to the next entry */
+ key_read(c, &dent->key, &key);
+ nm.name = dent->name;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+
+ /* The previous entry is freed only after the next one was found */
+ kfree(file->private_data);
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ cond_resched();
+ }
+
+out:
+ if (err != -ENOENT) {
+ ubifs_err("cannot find next direntry, error %d", err);
+ return err;
+ }
+
+ /* %-ENOENT: no more entries, drop the saved state and mark the end */
+ kfree(file->private_data);
+ file->private_data = NULL;
+ file->f_pos = 2;
+ return 0;
+}
+
+/*
+ * Seeking a directory invalidates the directory entry saved by 'readdir()',
+ * so drop it before the file position is changed.
+ */
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	void *saved = file->private_data;
+
+	file->private_data = NULL;
+	kfree(saved);
+	return generic_file_llseek(file, offset, origin);
+}
+
+/* Closing a directory releases the entry saved by 'readdir()', if any */
+static int ubifs_dir_release(struct inode *dir, struct file *file)
+{
+	void *saved = file->private_data;
+
+	file->private_data = NULL;
+	kfree(saved);
+	return 0;
+}
+
+/**
+ * lock_2_inodes - lock two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * The 'ui_mutex' of the inode with the smaller inode number is taken first
+ * (lockdep sub-class %WB_MUTEX_2), then that of the other inode (sub-class
+ * %WB_MUTEX_3), so the order does not depend on the order of the arguments.
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	struct inode *first = inode1;
+	struct inode *second = inode2;
+
+	if (!(inode1->i_ino < inode2->i_ino)) {
+		first = inode2;
+		second = inode1;
+	}
+
+	mutex_lock_nested(&ubifs_inode(first)->ui_mutex, WB_MUTEX_2);
+	mutex_lock_nested(&ubifs_inode(second)->ui_mutex, WB_MUTEX_3);
+}
+
+/**
+ * unlock_2_inodes - unlock two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * Releases the 'ui_mutex' of @inode1 and then that of @inode2.
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	struct ubifs_inode *ui1 = ubifs_inode(inode1);
+	struct ubifs_inode *ui2 = ubifs_inode(inode2);
+
+	mutex_unlock(&ui1->ui_mutex);
+	mutex_unlock(&ui2->ui_mutex);
+}
+
+/**
+ * ubifs_link - create a hard link.
+ * @old_dentry: dentry of the existing inode to link to
+ * @dir: directory inode in which the new entry is created
+ * @dentry: dentry which carries the name of the new entry
+ *
+ * With both inodes locked, the link count and the reference count of the
+ * target inode are incremented, the size of @dir grows by the size of the new
+ * direntry and the change is written with 'ubifs_jnl_update()'. If that fails,
+ * all these changes are undone. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = old_dentry->d_inode;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
+ .dirtied_ino_d = ui->data_len };
+
+ /*
+ * Budget request settings: new direntry, changing the target inode,
+ * changing the parent inode.
+ */
+
+ dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+ inode->i_nlink, dir->i_ino);
+ err = dbg_check_synced_i_size(inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ lock_2_inodes(dir, inode);
+ inc_nlink(inode);
+ /* Reference for the new dentry; dropped by 'iput()' on failure */
+ atomic_inc(&inode->i_count);
+ inode->i_ctime = ubifs_current_time(inode);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ ubifs_release_budget(c, &req);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ /* Roll back the size and link count changes made under the locks */
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ drop_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ ubifs_release_budget(c, &req);
+ iput(inode);
+ return err;
+}
+
+/**
+ * ubifs_unlink - remove a directory entry of a non-directory inode.
+ * @dir: parent directory inode
+ * @dentry: dentry of the entry to remove
+ *
+ * With both inodes locked, the link count of the inode is dropped, the size
+ * of @dir shrinks by the size of the direntry and the deletion is written
+ * with 'ubifs_jnl_update()'. If budgeting fails with %-ENOSPC the operation
+ * goes ahead without a budget. If the journal update fails, the changes are
+ * undone. Returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, budgeted = 1;
+ struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+ /*
+ * Budget request settings: deletion direntry, deletion inode (+1 for
+ * @dirtied_ino), changing the parent directory inode. If budgeting
+ * fails, go ahead anyway because we have extra space reserved for
+ * deletions.
+ */
+
+ dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+ inode->i_nlink, dir->i_ino);
+ err = dbg_check_synced_i_size(inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ if (err != -ENOSPC)
+ return err;
+ /* No budget was taken, so none must be released later */
+ err = 0;
+ budgeted = 0;
+ }
+
+ lock_2_inodes(dir, inode);
+ inode->i_ctime = ubifs_current_time(dir);
+ drop_nlink(inode);
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ /* We've deleted something - clean the "no space" flags */
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
+ return 0;
+
+out_cancel:
+ /* Roll back the size and link count changes made under the locks */
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ inc_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * check_dir_empty - check if a directory is empty or not.
+ * @c: UBIFS file-system description object
+ * @dir: VFS inode object of the directory to check
+ *
+ * This function asks the TNC for the first entry of directory @dir. Returns
+ * zero if there is none (the directory is empty), %-ENOTEMPTY if an entry
+ * was found, and other negative error codes in case of errors.
+ */
+static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+{
+	struct qstr nm = { .name = NULL };
+	struct ubifs_dent_node *dent;
+	union ubifs_key key;
+	int err;
+
+	lowest_dent_key(c, &key, dir->i_ino);
+	dent = ubifs_tnc_next_ent(c, &key, &nm);
+	if (!IS_ERR(dent)) {
+		/* An entry exists; it was only needed for the check */
+		kfree(dent);
+		return -ENOTEMPTY;
+	}
+
+	err = PTR_ERR(dent);
+	return err == -ENOENT ? 0 : err;
+}
+
+/**
+ * ubifs_rmdir - remove an empty directory.
+ * @dir: parent directory inode
+ * @dentry: dentry of the directory to remove
+ *
+ * Fails with the error of 'check_dir_empty()' if the directory is not empty.
+ * Otherwise, with both inodes locked, the link count of the directory is
+ * cleared, the link count of @dir is dropped, the size of @dir shrinks by the
+ * size of the direntry and the deletion is written with 'ubifs_jnl_update()'.
+ * If budgeting fails with %-ENOSPC the operation goes ahead without a budget.
+ * If the journal update fails, the changes are undone. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, budgeted = 1;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+ /*
+ * Budget request settings: deletion direntry, deletion inode and
+ * changing the parent inode. If budgeting fails, go ahead anyway
+ * because we have extra space reserved for deletions.
+ */
+
+ dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
+ dentry->d_name.name, inode->i_ino, dir->i_ino);
+
+ err = check_dir_empty(c, dentry->d_inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ if (err != -ENOSPC)
+ return err;
+ /* No budget was taken, so none must be released later */
+ budgeted = 0;
+ }
+
+ lock_2_inodes(dir, inode);
+ inode->i_ctime = ubifs_current_time(dir);
+ clear_nlink(inode);
+ drop_nlink(dir);
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ /* We've deleted something - clean the "no space" flags */
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
+ return 0;
+
+out_cancel:
+ /*
+ * Roll back: the link count of the removed directory was cleared above,
+ * so it is raised twice here.
+ */
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ inc_nlink(dir);
+ inc_nlink(inode);
+ inc_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * ubifs_mkdir - create a directory.
+ * @dir: parent directory inode
+ * @dentry: directory entry of the directory to create
+ * @mode: access mode bits of the new directory
+ *
+ * This function budgets for the operation, allocates the new inode, accounts
+ * the new directory entry in @dir and writes the change with
+ * 'ubifs_jnl_update()'. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode. The parent is budgeted as one dirtied inode
+	 * (@dirtied_ino), like in 'ubifs_mknod()' and 'ubifs_symlink()';
+	 * @dirtied_ino_d is an amount of inode data and is not what is needed
+	 * here.
+	 */
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.dirtied_ino = 1 };
+
+	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	mutex_lock(&dir_ui->ui_mutex);
+	insert_inode_hash(inode);
+	/* The new directory and its parent both gain one link */
+	inc_nlink(inode);
+	inc_nlink(dir);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err) {
+		ubifs_err("cannot create directory, error %d", err);
+		goto out_cancel;
+	}
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	/* Undo the changes made to the parent directory */
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	drop_nlink(dir);
+	mutex_unlock(&dir_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * ubifs_mknod - create a special file (device node, FIFO, socket).
+ * @dir: parent directory inode
+ * @dentry: directory entry of the node to create
+ * @mode: file type and access mode bits
+ * @rdev: device number (used for block and character devices)
+ *
+ * For block and character devices the encoded device descriptor is stored as
+ * the inode data. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
+		       int mode, dev_t rdev)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	union ubifs_dev_desc *dev = NULL;
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, devlen = 0;
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode. The inode data length (@new_ino_d) is filled in
+	 * below, once the device descriptor has been encoded.
+	 */
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.dirtied_ino = 1 };
+
+	dbg_gen("dent '%.*s' in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode)) {
+		dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+		if (!dev)
+			return -ENOMEM;
+		devlen = ubifs_encode_dev(dev, rdev);
+	}
+
+	/*
+	 * @devlen is known only now. Setting @new_ino_d in the initializer
+	 * above would always budget for zero bytes of inode data.
+	 */
+	req.new_ino_d = devlen;
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		kfree(dev);
+		return err;
+	}
+
+	inode = ubifs_new_inode(c, dir, mode);
+	if (IS_ERR(inode)) {
+		kfree(dev);
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	init_special_inode(inode, inode->i_mode, rdev);
+	ui = ubifs_inode(inode);
+	inode->i_size = ui->ui_size = devlen;
+	/* From now on the device descriptor belongs to the inode */
+	ui->data = dev;
+	ui->data_len = devlen;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * ubifs_symlink - create a symbolic link.
+ * @dir: parent directory inode
+ * @dentry: directory entry of the link to create
+ * @symname: target of the link
+ *
+ * The target is kept as the inode data. Returns zero in case of success,
+ * %-ENAMETOOLONG if the target is longer than %UBIFS_MAX_INO_DATA, and other
+ * negative error codes in case of failure.
+ */
+static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_inode *ui;
+	struct inode *inode;
+	char *target;
+	int err, len = strlen(symname);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.new_ino_d = len, .dirtied_ino = 1 };
+
+	dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
+		dentry->d_name.name, symname, dir->i_ino);
+
+	if (len > UBIFS_MAX_INO_DATA)
+		return -ENAMETOOLONG;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	ui = ubifs_inode(inode);
+	target = kmalloc(len + 1, GFP_NOFS);
+	ui->data = target;
+	if (!target) {
+		err = -ENOMEM;
+		goto out_inode;
+	}
+
+	/*
+	 * The terminating zero byte is there only to make in-memory string
+	 * processing simpler, it is not written to the flash media. This is
+	 * why the data length is @len and not @len + %1.
+	 */
+	memcpy(target, symname, len);
+	target[len] = '\0';
+	ui->data_len = len;
+	inode->i_size = ui->ui_size = len;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * lock_3_inodes - lock three UBIFS inodes for rename.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ *
+ * In 'ubifs_rename()' @inode1 and @inode2 may be the same inode and @inode3
+ * may be %NULL. When all three inodes are different, the one with the lowest
+ * address is locked first and the remaining two are passed to
+ * 'lock_2_inodes()'.
+ */
+static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
+			  struct inode *inode3)
+{
+	struct inode *first, *second, *third;
+
+	if (!inode3) {
+		/* At most two inodes to lock */
+		if (inode1 != inode2)
+			lock_2_inodes(inode1, inode2);
+		else
+			mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex,
+					  WB_MUTEX_1);
+		return;
+	}
+
+	if (inode1 == inode2) {
+		lock_2_inodes(inode1, inode3);
+		return;
+	}
+
+	/* 3 different inodes */
+	if (inode1 < inode2) {
+		third = inode2;
+		first = inode1 < inode3 ? inode1 : inode3;
+		second = inode1 < inode3 ? inode3 : inode1;
+	} else {
+		third = inode1;
+		first = inode2 < inode3 ? inode2 : inode3;
+		second = inode2 < inode3 ? inode3 : inode2;
+	}
+
+	mutex_lock_nested(&ubifs_inode(first)->ui_mutex, WB_MUTEX_1);
+	lock_2_inodes(second, third);
+}
+
+/**
+ * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * @inode1: first inode
+ * @inode2: second inode (may be the same as @inode1)
+ * @inode3: third inode (may be %NULL)
+ *
+ * Each distinct, non-NULL inode is unlocked exactly once.
+ */
+static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
+			    struct inode *inode3)
+{
+	struct ubifs_inode *ui1 = ubifs_inode(inode1);
+
+	mutex_unlock(&ui1->ui_mutex);
+
+	if (inode2 != inode1) {
+		struct ubifs_inode *ui2 = ubifs_inode(inode2);
+
+		mutex_unlock(&ui2->ui_mutex);
+	}
+
+	if (inode3) {
+		struct ubifs_inode *ui3 = ubifs_inode(inode3);
+
+		mutex_unlock(&ui3->ui_mutex);
+	}
+}
+
+/*
+ * Rename @old_dentry of @old_dir to @new_dentry of @new_dir. If @new_dentry
+ * already refers to an inode, that inode is unlinked; when directories are
+ * involved it has to be empty (see the 'check_dir_empty()' call). All
+ * in-memory changes (link counts, directory sizes, time stamps) are made under
+ * the locks taken by 'lock_3_inodes()' and are rolled back at 'out_cancel' if
+ * 'ubifs_jnl_rename()' fails. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ubifs_info *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
+ int err, release, sync = 0, move = (new_dir != old_dir);
+ int is_dir = S_ISDIR(old_inode->i_mode);
+ int unlink = !!new_inode;
+ int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
+ int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
+ .dirtied_ino = 3 };
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
+ .dirtied_ino_d = old_inode_ui->data_len };
+ struct timespec time;
+
+ /*
+ * Budget request settings: deletion direntry, new direntry, removing
+ * the old inode, and changing old and new parent directory inodes.
+ *
+ * However, this operation also marks the target inode as dirty and
+ * does not write it, so we allocate budget for the target inode
+ * separately.
+ */
+
+ dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
+ "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
+ old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
+ new_dentry->d_name.name, new_dir->i_ino);
+
+ /* A directory may replace only an empty target */
+ if (unlink && is_dir) {
+ err = check_dir_empty(c, new_inode);
+ if (err)
+ return err;
+ }
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+ err = ubifs_budget_space(c, &ino_req);
+ if (err) {
+ ubifs_release_budget(c, &req);
+ return err;
+ }
+
+ lock_3_inodes(old_dir, new_dir, new_inode);
+
+ /*
+ * Like most other Unix systems, set the @i_ctime for inodes on a
+ * rename.
+ */
+ time = ubifs_current_time(old_dir);
+ old_inode->i_ctime = time;
+
+ /* We must adjust parent link count when renaming directories */
+ if (is_dir) {
+ if (move) {
+ /*
+ * @old_dir loses a link because we are moving
+ * @old_inode to a different directory.
+ */
+ drop_nlink(old_dir);
+ /*
+ * @new_dir only gains a link if we are not also
+ * overwriting an existing directory.
+ */
+ if (!unlink)
+ inc_nlink(new_dir);
+ } else {
+ /*
+ * @old_inode is not moving to a different directory,
+ * but @old_dir still loses a link if we are
+ * overwriting an existing directory.
+ */
+ if (unlink)
+ drop_nlink(old_dir);
+ }
+ }
+
+ old_dir->i_size -= old_sz;
+ ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+ old_dir->i_mtime = old_dir->i_ctime = time;
+ new_dir->i_mtime = new_dir->i_ctime = time;
+
+ /*
+ * And finally, if we unlinked a direntry which happened to have the
+ * same name as the moved direntry, we have to decrement @i_nlink of
+ * the unlinked inode and change its ctime.
+ */
+ if (unlink) {
+ /*
+ * Directories cannot have hard-links, so if this is a
+ * directory, decrement its @i_nlink twice because an empty
+ * directory has @i_nlink 2.
+ */
+ if (is_dir)
+ drop_nlink(new_inode);
+ new_inode->i_ctime = time;
+ drop_nlink(new_inode);
+ } else {
+ /* No entry is replaced, so @new_dir grows by the new entry */
+ new_dir->i_size += new_sz;
+ ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+ }
+
+ /*
+ * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
+ * is dirty, because this will be done later on at the end of
+ * 'ubifs_rename()'.
+ */
+ if (IS_SYNC(old_inode)) {
+ sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
+ if (unlink && IS_SYNC(new_inode))
+ sync = 1;
+ }
+ err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
+ sync);
+ if (err)
+ goto out_cancel;
+
+ unlock_3_inodes(old_dir, new_dir, new_inode);
+ ubifs_release_budget(c, &req);
+
+ /*
+ * Mark @old_inode dirty. If it was dirty already, the separate
+ * @ino_req budget is released here; otherwise it is kept (presumably
+ * until the inode is written back - TODO confirm).
+ */
+ mutex_lock(&old_inode_ui->ui_mutex);
+ release = old_inode_ui->dirty;
+ mark_inode_dirty_sync(old_inode);
+ mutex_unlock(&old_inode_ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &ino_req);
+ if (IS_SYNC(old_inode))
+ err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
+ return err;
+
+out_cancel:
+ /* Roll back every in-memory change made above */
+ if (unlink) {
+ if (is_dir)
+ inc_nlink(new_inode);
+ inc_nlink(new_inode);
+ } else {
+ new_dir->i_size -= new_sz;
+ ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+ }
+ old_dir->i_size += old_sz;
+ ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+ if (is_dir) {
+ if (move) {
+ inc_nlink(old_dir);
+ if (!unlink)
+ drop_nlink(new_dir);
+ } else {
+ if (unlink)
+ inc_nlink(old_dir);
+ }
+ }
+ unlock_3_inodes(old_dir, new_dir, new_inode);
+ ubifs_release_budget(c, &ino_req);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * ubifs_getattr - fill in the attributes reported by 'stat()'.
+ * @mnt: VFS mount (not used)
+ * @dentry: directory entry of the inode to report about
+ * @stat: where to store the attributes
+ *
+ * The attributes are read with the inode's @ui_mutex held. Always returns
+ * zero.
+ */
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	loff_t bytes;
+
+	mutex_lock(&ui->ui_mutex);
+
+	stat->dev = inode->i_sb->s_dev;
+	stat->ino = inode->i_ino;
+	stat->mode = inode->i_mode;
+	stat->nlink = inode->i_nlink;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
+	stat->rdev = inode->i_rdev;
+	stat->atime = inode->i_atime;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
+	stat->blksize = UBIFS_BLOCK_SIZE;
+	stat->size = ui->ui_size;
+
+	/*
+	 * 'stat()' was designed with block-device based file systems in mind,
+	 * while UBIFS has no notion of "block". A directory, for example,
+	 * takes less than 300 bytes, and rounding that up to the block size
+	 * introduces a large mistake, which makes utilities like 'du' report
+	 * senseless numbers. So UBIFS does what JFFS2 does: it reports zero
+	 * blocks for everything but regular files, which makes more sense than
+	 * reporting completely wrong sizes.
+	 */
+	if (!S_ISREG(inode->i_mode)) {
+		stat->blocks = 0;
+	} else {
+		bytes = ui->xattr_size;
+		bytes += stat->size;
+		bytes = ALIGN(bytes, UBIFS_BLOCK_SIZE);
+		/*
+		 * User-space expects the count in 512-byte units, whatever was
+		 * reported in @stat->blksize.
+		 */
+		stat->blocks = bytes >> 9;
+	}
+
+	mutex_unlock(&ui->ui_mutex);
+	return 0;
+}
+
+/* Inode operations of UBIFS directories */
+struct inode_operations ubifs_dir_inode_operations = {
+	/* Name look-up and creation of new objects */
+	.lookup      = ubifs_lookup,
+	.create      = ubifs_create,
+	.mkdir       = ubifs_mkdir,
+	.mknod       = ubifs_mknod,
+	.symlink     = ubifs_symlink,
+	.link        = ubifs_link,
+	/* Removal and renaming */
+	.unlink      = ubifs_unlink,
+	.rmdir       = ubifs_rmdir,
+	.rename      = ubifs_rename,
+	/* Attributes */
+	.getattr     = ubifs_getattr,
+	.setattr     = ubifs_setattr,
+#ifdef CONFIG_UBIFS_FS_XATTR
+	.getxattr    = ubifs_getxattr,
+	.setxattr    = ubifs_setxattr,
+	.listxattr   = ubifs_listxattr,
+	.removexattr = ubifs_removexattr,
+#endif
+};
+
+/* File operations of UBIFS directories */
+struct file_operations ubifs_dir_operations = {
+	.read           = generic_read_dir,
+	.readdir        = ubifs_readdir,
+	.llseek         = ubifs_dir_llseek,
+	.fsync          = ubifs_fsync,
+	.release        = ubifs_dir_release,
+	.unlocked_ioctl = ubifs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ubifs_compat_ioctl,
+#endif
+};
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
new file mode 100644
index 00000000000..005a3b854d9
--- /dev/null
+++ b/fs/ubifs/file.c
@@ -0,0 +1,1275 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements VFS file and inode operations of regular files, device
+ * nodes and symlinks as well as address space operations.
+ *
+ * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
+ * page is dirty and is used for budgeting purposes - dirty pages should not be
+ * budgeted. The PG_checked flag is set if full budgeting is required for the
+ * page e.g., when it corresponds to a file hole or it is just beyond the file
+ * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
+ * fail in this function, and the budget is released in 'ubifs_write_end()'. So
+ * the PG_private and PG_checked flags carry the information about how the page
+ * was budgeted, to make it possible to release the budget properly.
+ *
+ * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
+ * we implement. However, this is not true for '->writepage()', which might be
+ * called with 'i_mutex' unlocked. For example, when pdflush is performing
+ * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
+ * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
+ * locked in '->writepage()', e.g. in the "sys_write -> alloc_pages -> direct
+ * reclaim" path. So, in '->writepage()' we are only guaranteed that the page
+ * is locked.
+ *
+ * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
+ * readahead path does not have it locked ("sys_read -> generic_file_aio_read
+ * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
+ * not set as well. However, UBIFS disables readahead.
+ *
+ * This, for example means that there might be 2 concurrent '->writepage()'
+ * calls for the same inode, but different inode dirty pages.
+ */
+
+#include "ubifs.h"
+#include <linux/mount.h>
+
+/**
+ * read_block - read one UBIFS block of an inode.
+ * @inode: inode the block belongs to
+ * @addr: where to put the %UBIFS_BLOCK_SIZE bytes of the block
+ * @block: block number
+ * @dn: buffer for the data node
+ *
+ * Returns zero in case of success, %-ENOENT if there is no data node for the
+ * block (@addr is zeroed in this case), %-EINVAL if the data node is bad, and
+ * the error code of the TNC look-up in other cases.
+ */
+static int read_block(struct inode *inode, void *addr, unsigned int block,
+		      struct ubifs_data_node *dn)
+{
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	union ubifs_key key;
+	unsigned int compr_len;
+	int err, data_len, out_len = UBIFS_BLOCK_SIZE;
+
+	data_key_init(c, &key, inode->i_ino, block);
+	err = ubifs_tnc_lookup(c, &key, dn);
+	if (err == -ENOENT)
+		/* Not found, so it must be a hole */
+		memset(addr, 0, UBIFS_BLOCK_SIZE);
+	if (err)
+		return err;
+
+	ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
+
+	data_len = le32_to_cpu(dn->size);
+	if (data_len <= 0 || data_len > UBIFS_BLOCK_SIZE)
+		goto dump;
+
+	compr_len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	err = ubifs_decompress(&dn->data, compr_len, addr, &out_len,
+			       le16_to_cpu(dn->compr_type));
+	if (err || data_len != out_len)
+		goto dump;
+
+	/*
+	 * A block may hold less than %UBIFS_BLOCK_SIZE bytes of data even if
+	 * it is not the last block of the file (e.g., after making a hole and
+	 * appending data), so zero out whatever follows the data.
+	 */
+	if (data_len < UBIFS_BLOCK_SIZE)
+		memset(addr + data_len, 0, UBIFS_BLOCK_SIZE - data_len);
+
+	return 0;
+
+dump:
+	ubifs_err("bad data node (block %u, inode %lu)",
+		  block, inode->i_ino);
+	dbg_dump_node(c, dn);
+	return -EINVAL;
+}
+
+/**
+ * do_readpage - fill a page cache page with data.
+ * @page: the page to fill
+ *
+ * The page is read block by block. Blocks which are beyond the inode size or
+ * which have no data node are zeroed, and the page gets the @PG_checked flag
+ * in this case. Returns zero in case of success (the page is marked
+ * up-to-date) and a negative error code in case of failure (the page is
+ * marked with the error flag).
+ */
+static int do_readpage(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	struct ubifs_data_node *dn;
+	unsigned int block, beyond;
+	int err = 0, i;
+	void *addr;
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+		inode->i_ino, page->index, i_size, page->flags);
+	ubifs_assert(!PageChecked(page));
+	ubifs_assert(!PagePrivate(page));
+
+	addr = kmap(page);
+
+	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+	if (block >= beyond) {
+		/* The whole page is beyond the inode */
+		SetPageChecked(page);
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		goto out;
+	}
+
+	dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
+	if (!dn) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	for (i = 0; i < UBIFS_BLOCKS_PER_PAGE;
+	     i++, block++, addr += UBIFS_BLOCK_SIZE) {
+		int ret;
+
+		if (block >= beyond) {
+			/* This block is beyond the inode */
+			err = -ENOENT;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+			continue;
+		}
+
+		ret = read_block(inode, addr, block, dn);
+		if (!ret)
+			continue;
+
+		/* Holes are remembered, anything else stops the reading */
+		err = ret;
+		if (err != -ENOENT)
+			break;
+	}
+
+	if (err && err != -ENOENT) {
+		ubifs_err("cannot read page %lu of inode %lu, error %d",
+			  page->index, inode->i_ino, err);
+		goto error;
+	}
+
+	if (err) {
+		/* Not found, so it must be a hole */
+		SetPageChecked(page);
+		dbg_gen("hole");
+	}
+
+	kfree(dn);
+out:
+	SetPageUptodate(page);
+	ClearPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	return 0;
+
+error:
+	kfree(dn);
+	ClearPageUptodate(page);
+	SetPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	return err;
+}
+
+/**
+ * release_new_page_budget - give back the budget of one new page.
+ * @c: UBIFS file-system description object
+ *
+ * This helper releases a budget request which describes one new page of
+ * data, with the @recalculate flag set.
+ */
+static void release_new_page_budget(struct ubifs_info *c)
+{
+	struct ubifs_budget_req new_page_req = {
+		.new_page = 1,
+		.recalculate = 1,
+	};
+
+	ubifs_release_budget(c, &new_page_req);
+}
+
+/**
+ * release_existing_page_budget - give back the budget of an existing page.
+ * @c: UBIFS file-system description object
+ *
+ * This helper releases the budget of changing one page of data which already
+ * exists on the flash media, i.e. @c->page_budget bytes of data growth.
+ */
+static void release_existing_page_budget(struct ubifs_info *c)
+{
+	struct ubifs_budget_req page_req = {
+		.dd_growth = c->page_budget,
+	};
+
+	ubifs_release_budget(c, &page_req);
+}
+
+/**
+ * write_begin_slow - slow path of 'ubifs_write_begin()'.
+ * @mapping: address space of the inode which is written to
+ * @pos: position in the file where the write starts
+ * @len: how many bytes are going to be written
+ * @pagep: the locked page is returned here
+ *
+ * Unlike the fast path, this function budgets before it locks the page, so
+ * the budgeting sub-system is allowed to force write-back. Like the fast
+ * path, it returns with @ui_mutex locked if data is appended. Returns zero in
+ * case of success and a negative error code in case of failure; nothing is
+ * left locked or budgeted in case of failure.
+ */
+static int write_begin_slow(struct address_space *mapping,
+			    loff_t pos, unsigned len, struct page **pagep)
+{
+	struct inode *inode = mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct ubifs_budget_req req = { .new_page = 1 };
+	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	struct page *page;
+
+	dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
+		inode->i_ino, pos, len, inode->i_size);
+
+	/*
+	 * At the slow path we have to budget before locking the page, because
+	 * budgeting may force write-back, which would wait on locked pages and
+	 * deadlock if we had the page locked. At this point we do not know
+	 * anything about the page, so assume that this is a new page which is
+	 * written to a hole. This corresponds to largest budget. Later the
+	 * budget will be amended if this is not true.
+	 */
+	if (appending)
+		/* We are appending data, budget for inode change */
+		req.dirtied_ino = 1;
+
+	err = ubifs_budget_space(c, &req);
+	if (unlikely(err))
+		return err;
+
+	page = __grab_cache_page(mapping, index);
+	if (unlikely(!page)) {
+		ubifs_release_budget(c, &req);
+		return -ENOMEM;
+	}
+
+	if (!PageUptodate(page)) {
+		/*
+		 * The whole page is overwritten only if the write starts at
+		 * offset zero within the page, i.e. the bits selected by
+		 * ~%PAGE_CACHE_MASK are clear, and covers the whole page.
+		 */
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+			SetPageChecked(page);
+		else {
+			err = do_readpage(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				/* Do not leak the budget acquired above */
+				ubifs_release_budget(c, &req);
+				return err;
+			}
+		}
+
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+
+	if (PagePrivate(page))
+		/*
+		 * The page is dirty, which means it was budgeted twice:
+		 *   o first time the budget was allocated by the task which
+		 *     made the page dirty and set the PG_private flag;
+		 *   o and then we budgeted for it for the second time at the
+		 *     very beginning of this function.
+		 *
+		 * So what we have to do is to release the page budget we
+		 * allocated.
+		 */
+		release_new_page_budget(c);
+	else if (!PageChecked(page))
+		/*
+		 * We are changing a page which already exists on the media.
+		 * This means that changing the page does not make the amount
+		 * of indexing information larger, and this part of the budget
+		 * which we have already acquired may be released.
+		 */
+		ubifs_convert_page_budget(c);
+
+	if (appending) {
+		struct ubifs_inode *ui = ubifs_inode(inode);
+
+		/*
+		 * 'ubifs_write_end()' is optimized from the fast-path part of
+		 * 'ubifs_write_begin()' and expects the @ui_mutex to be locked
+		 * if data is appended.
+		 */
+		mutex_lock(&ui->ui_mutex);
+		if (ui->dirty)
+			/*
+			 * The inode is dirty already, so we may free the
+			 * budget we allocated.
+			 */
+			ubifs_release_dirty_inode_budget(c, ui);
+	}
+
+	*pagep = page;
+	return 0;
+}
+
+/**
+ * allocate_budget - allocate budget for 'ubifs_write_begin()'.
+ * @c: UBIFS file-system description object
+ * @page: page to allocate budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This helper of 'ubifs_write_begin()' works out what has to be budgeted for
+ * - nothing, the page, the inode, or both - and makes a "fast" budgeting
+ * request. If @appending is non-zero, @ui->ui_mutex is locked and is left
+ * locked when the function returns, whatever the result is. Returns zero in
+ * case of success and %-ENOSPC in case of failure.
+ */
+static int allocate_budget(struct ubifs_info *c, struct page *page,
+			   struct ubifs_inode *ui, int appending)
+{
+	struct ubifs_budget_req req = { .fast = 1 };
+
+	if (!PagePrivate(page)) {
+		/*
+		 * The page is clean. If it corresponds to a hole, it does not
+		 * exist on the media and changing it makes the amount of
+		 * indexing information larger, so a new page is budgeted.
+		 * Otherwise no new indexing information is added and only a
+		 * page change is budgeted.
+		 */
+		if (PageChecked(page))
+			req.new_page = 1;
+		else
+			req.dirtied_page = 1;
+
+		if (appending) {
+			mutex_lock(&ui->ui_mutex);
+			/*
+			 * Appending changes the inode; a clean inode will
+			 * become dirty, which needs a budget.
+			 */
+			if (!ui->dirty)
+				req.dirtied_ino = 1;
+		}
+
+		return ubifs_budget_space(c, &req);
+	}
+
+	/* The page is dirty, so the page itself has been budgeted already */
+	if (!appending)
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	if (ui->dirty)
+		/*
+		 * The inode is dirty as well, no budget is needed at all. But
+		 * @ui->ui_mutex has to stay locked to prevent write-back from
+		 * flushing the inode and freeing the budget. The lock is
+		 * released in 'ubifs_write_end()'.
+		 */
+		return 0;
+
+	/* Dirty page, clean inode, appending: budget the inode change */
+	req.dirtied_ino = 1;
+	return ubifs_budget_space(c, &req);
+}
+
+/*
+ * This function is called when a page of data is going to be written. Since
+ * the page of data will not necessarily go to the flash straight away, UBIFS
+ * has to reserve space on the media for it, which is done by means of
+ * budgeting.
+ *
+ * This is the hot-path of the file-system and we are trying to optimize it as
+ * much as possible. For this reason it is split into 2 parts - slow and fast.
+ *
+ * There are many budgeting cases:
+ *   o a new page is appended - we have to budget for a new page and for
+ *     changing the inode; however, if the inode is already dirty, there is
+ *     no need to budget for it;
+ *   o an existing clean page is changed - we have to budget for it; if the
+ *     page does not exist on the media (a hole), we have to budget for a new
+ *     page; otherwise, we may budget for changing an existing page; the
+ *     difference between these cases is that changing an existing page does
+ *     not introduce anything new to the FS indexing information, so it does
+ *     not grow, and smaller budget is acquired in this case;
+ *   o an existing dirty page is changed - no need to budget at all, because
+ *     the page budget has been acquired earlier, when the page was marked
+ *     dirty.
+ *
+ * UBIFS budgeting sub-system may force write-back if it thinks there is no
+ * space to reserve. This imposes some locking restrictions and makes it
+ * impossible to take into account the above cases, and makes it impossible to
+ * optimize budgeting.
+ *
+ * The solution for this is that the fast path of 'ubifs_write_begin()' assumes
+ * there is plenty of flash space and the budget will be acquired quickly,
+ * without forcing write-back. The slow path does not make this assumption.
+ *
+ * Returns zero in case of success and a negative error code in case of
+ * failure. In case of success the locked page is returned in @pagep, and
+ * @ui_mutex of the inode is locked if data is appended.
+ */
+static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	struct page *page;
+
+	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+
+	if (unlikely(c->ro_media))
+		return -EROFS;
+
+	/* Try out the fast-path part first */
+	page = __grab_cache_page(mapping, index);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (!PageUptodate(page)) {
+		/* The page is not loaded from the flash */
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+			/*
+			 * The write starts at offset zero within the page
+			 * (the bits selected by ~%PAGE_CACHE_MASK are clear)
+			 * and covers the whole page, so there is no need to
+			 * load it. But we have to set the @PG_checked flag to
+			 * make the further code treat the page as new. This
+			 * might be not true, but it is better to budget more
+			 * than to read the page from the media.
+			 */
+			SetPageChecked(page);
+		else {
+			err = do_readpage(page);
+			if (err) {
+				unlock_page(page);
+				page_cache_release(page);
+				return err;
+			}
+		}
+
+		SetPageUptodate(page);
+		ClearPageError(page);
+	}
+
+	err = allocate_budget(c, page, ui, appending);
+	if (unlikely(err)) {
+		ubifs_assert(err == -ENOSPC);
+		/*
+		 * Budgeting failed which means it would have to force
+		 * write-back but didn't, because we set the @fast flag in the
+		 * request. Write-back cannot be done now, while we have the
+		 * page locked, because it would deadlock. Unlock and free
+		 * everything and fall-back to slow-path.
+		 */
+		if (appending) {
+			ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+			mutex_unlock(&ui->ui_mutex);
+		}
+		unlock_page(page);
+		page_cache_release(page);
+
+		return write_begin_slow(mapping, pos, len, pagep);
+	}
+
+	/*
+	 * Whee, we acquired budgeting quickly - without involving
+	 * garbage-collection, committing or forcing write-back. We return
+	 * with @ui->ui_mutex locked if we are appending pages, and unlocked
+	 * otherwise. This is an optimization (slightly hacky though).
+	 */
+	*pagep = page;
+	return 0;
+}
+
+/**
+ * cancel_budget - cancel budget.
+ * @c: UBIFS file-system description object
+ * @page: page to cancel budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This helper of the page write operation gives back the inode budget (if
+ * data is appended and the inode is clean) and the page budget (if the page
+ * does not have the @PG_private flag). It also unlocks @ui->ui_mutex in case
+ * of appending.
+ */
+static void cancel_budget(struct ubifs_info *c, struct page *page,
+			  struct ubifs_inode *ui, int appending)
+{
+	if (appending) {
+		if (!ui->dirty)
+			ubifs_release_dirty_inode_budget(c, ui);
+		mutex_unlock(&ui->ui_mutex);
+	}
+
+	/* No page budget to give back for a page with @PG_private set */
+	if (PagePrivate(page))
+		return;
+
+	/* @PG_checked tells which kind of page budget was acquired */
+	if (PageChecked(page))
+		release_new_page_budget(c);
+	else
+		release_existing_page_budget(c);
+}
+
+/*
+ * This function finishes a page write which was started by
+ * 'ubifs_write_begin()'. It marks the page dirty (setting @PG_private), and
+ * if data was appended it updates the inode size, marks the inode dirty and
+ * unlocks the @ui_mutex which was left locked by 'ubifs_write_begin()'.
+ * Returns @copied, except when a whole-page write was short: then the budget
+ * is cancelled and the return value of 'do_readpage()' is returned (zero in
+ * case of success, which makes VFS repeat the operation).
+ */
+static int ubifs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ loff_t end_pos = pos + len;
+ /* Same condition as 'ubifs_write_begin()' evaluates */
+ int appending = !!(end_pos > inode->i_size);
+
+ dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
+ inode->i_ino, pos, page->index, len, copied, inode->i_size);
+
+ if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+ /*
+ * VFS copied less data to the page that it intended and
+ * declared in its '->write_begin()' call via the @len
+ * argument. If the page was not up-to-date, and @len was
+ * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
+ * not load it from the media (for optimization reasons). This
+ * means that part of the page contains garbage. So read the
+ * page now.
+ */
+ dbg_gen("copied %d instead of %d, read page and repeat",
+ copied, len);
+ cancel_budget(c, page, ui, appending);
+
+ /*
+ * Return 0 to force VFS to repeat the whole operation, or the
+ * error code if 'do_readpage()' failes.
+ */
+ copied = do_readpage(page);
+ goto out;
+ }
+
+ /* First time this page is dirtied: account it and mark it dirty */
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
+
+ if (appending) {
+ i_size_write(inode, end_pos);
+ ui->ui_size = end_pos;
+ /*
+ * Note, we do not set @I_DIRTY_PAGES (which means that the
+ * inode has dirty pages), this has been done in
+ * '__set_page_dirty_nobuffers()'.
+ */
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+ mutex_unlock(&ui->ui_mutex);
+ }
+
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
+}
+
+/*
+ * VFS '->readpage()' implementation: fill @page by means of 'do_readpage()'
+ * and unlock it. The value returned by 'do_readpage()' is not propagated, zero
+ * is always returned.
+ */
+static int ubifs_readpage(struct file *file, struct page *page)
+{
+	(void)do_readpage(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+/*
+ * do_writepage - write the first @len bytes of a page to the journal.
+ * @page: page to write; it has to be locked and have the "private" flag set
+ * @len: how many bytes of the page have to be written
+ *
+ * The data is written block by block (at most %UBIFS_BLOCKS_PER_PAGE blocks of
+ * at most %UBIFS_BLOCK_SIZE bytes each) with 'ubifs_jnl_write_data()'. If
+ * writing fails, the page is marked with the error flag and 'ubifs_ro_mode()'
+ * is called. In any case the page budget is released, the page stops being
+ * counted in @c->dirty_pg_cnt, its "private" and "checked" flags are cleared,
+ * and it is unlocked and its write-back is ended. Returns zero in case of
+ * success and the error code of 'ubifs_jnl_write_data()' in case of failure.
+ */
+static int do_writepage(struct page *page, int len)
+{
+	int err = 0, i, blen;
+	unsigned int block;
+	void *addr;
+	union ubifs_key key;
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+#ifdef UBIFS_DEBUG
+	/*
+	 * The debugging check needs the UBIFS inode, which has to be declared
+	 * here, and the synchronized size has to be converted to a page index
+	 * by shifting it right by %PAGE_CACHE_SHIFT.
+	 */
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	spin_lock(&ui->ui_lock);
+	ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
+	spin_unlock(&ui->ui_lock);
+#endif
+
+	/* Update radix tree tags */
+	set_page_writeback(page);
+
+	addr = kmap(page);
+	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	i = 0;
+	while (len) {
+		blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+		data_key_init(c, &key, inode->i_ino, block);
+		err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
+		if (err)
+			break;
+		/* Never write more blocks than a page can hold */
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		block += 1;
+		addr += blen;
+		len -= blen;
+	}
+	if (err) {
+		SetPageError(page);
+		ubifs_err("cannot write page %lu of inode %lu, error %d",
+			  page->index, inode->i_ino, err);
+		ubifs_ro_mode(c, err);
+	}
+
+	/* The budget is released whether or not the write succeeded */
+	ubifs_assert(PagePrivate(page));
+	if (PageChecked(page))
+		release_new_page_budget(c);
+	else
+		release_existing_page_budget(c);
+
+	atomic_long_dec(&c->dirty_pg_cnt);
+	ClearPagePrivate(page);
+	ClearPageChecked(page);
+
+	kunmap(page);
+	unlock_page(page);
+	end_page_writeback(page);
+	return err;
+}
+
+/*
+ * When writing-back dirty inodes, VFS first writes-back pages belonging to the
+ * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
+ * situation when we have an inode with size 0, then a megabyte of data is
+ * appended to the inode, then write-back starts and flushes some amount of the
+ * dirty pages, the journal becomes full, commit happens and finishes, and then
+ * an unclean reboot happens. When the file system is mounted next time, the
+ * inode size would still be 0, but there would be many pages which are beyond
+ * the inode size, they would be indexed and consume flash space. Because the
+ * journal has been committed, the replay would not be able to detect this
+ * situation and correct the inode size. This means UBIFS would have to scan
+ * whole index and correct all inode sizes, which is long and unacceptable.
+ *
+ * To prevent situations like this, UBIFS writes pages back only if they are
+ * within last synchronized inode size, i.e. the size which has been
+ * written to the flash media last time. Otherwise, UBIFS forces inode
+ * write-back, thus making sure the on-flash inode contains current inode size,
+ * and then keeps writing pages back.
+ *
+ * Some locking issues explanation. 'ubifs_writepage()' first is called with
+ * the page locked, and it locks @ui_mutex. However, write-back does not take inode
+ * @i_mutex, which means other VFS operations may be run on this inode at the
+ * same time. And the problematic one is truncation to smaller size, from where
+ * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * drops the truncated pages. And while dropping the pages, it takes the page
+ * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
+ * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ *
+ * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
+ * inode size. How do we do this if @inode->i_size may become smaller while we
+ * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
+ * @ui->ui_size "shadow" field which UBIFS uses instead of @inode->i_size
+ * internally and updates it under @ui_mutex.
+ *
+ * Q: why we do not worry that if we race with truncation, we may end up with a
+ * situation when the inode is truncated while we are in the middle of
+ * 'do_writepage()', so we do write beyond inode size?
+ * A: If we are in the middle of 'do_writepage()', truncation would be locked
+ * on the page lock and it would not write the truncated inode node to the
+ * journal before we have finished.
+ */
+static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ loff_t i_size = i_size_read(inode), synced_i_size;
+ /* Index of the page which contains @i_size */
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ /* @len is the offset of @i_size within that page */
+ int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+ void *kaddr;
+
+ dbg_gen("ino %lu, pg %lu, pg flags %#lx",
+ inode->i_ino, page->index, page->flags);
+ ubifs_assert(PagePrivate(page));
+
+ /* Is the page fully outside @i_size? (truncate in progress) */
+ if (page->index > end_index || (page->index == end_index && !len)) {
+ err = 0;
+ goto out_unlock;
+ }
+
+ /* Take a snapshot of the last synchronized inode size */
+ spin_lock(&ui->ui_lock);
+ synced_i_size = ui->synced_i_size;
+ spin_unlock(&ui->ui_lock);
+
+ /* Is the page fully inside @i_size? */
+ if (page->index < end_index) {
+ /* Write the inode first if the page is not below @synced_i_size */
+ if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ if (err)
+ goto out_unlock;
+ /*
+ * The inode has been written, but the write-buffer has
+ * not been synchronized, so in case of an unclean
+ * reboot we may end up with some pages beyond inode
+ * size, but they would be in the journal (because
+ * commit flushes write buffers) and recovery would deal
+ * with this.
+ */
+ }
+ /* 'do_writepage()' unlocks the page */
+ return do_writepage(page, PAGE_CACHE_SIZE);
+ }
+
+ /*
+ * The page straddles @i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ /* The on-flash inode size is behind: write the inode before the page */
+ if (i_size > synced_i_size) {
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ if (err)
+ goto out_unlock;
+ }
+
+ /* Only the part of the page which is within @i_size is written */
+ return do_writepage(page, len);
+
+out_unlock:
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * do_attr_changes - change inode attributes.
+ * @inode: inode to change attributes for
+ * @attr: describes attributes to change
+ *
+ * Copies to @inode those fields of @attr which are flagged in @attr->ia_valid:
+ * UID, GID, the three time-stamps and the mode. The time-stamps are truncated
+ * to the time granularity of the super-block. The set-group-ID bit is removed
+ * from the new mode unless the caller belongs to the group of the inode or is
+ * capable of %CAP_FSETID.
+ */
+static void do_attr_changes(struct inode *inode, const struct iattr *attr)
+{
+	const unsigned int valid = attr->ia_valid;
+
+	if (valid & ATTR_UID)
+		inode->i_uid = attr->ia_uid;
+
+	if (valid & ATTR_GID)
+		inode->i_gid = attr->ia_gid;
+
+	if (valid & ATTR_ATIME)
+		inode->i_atime = timespec_trunc(attr->ia_atime,
+						inode->i_sb->s_time_gran);
+
+	if (valid & ATTR_MTIME)
+		inode->i_mtime = timespec_trunc(attr->ia_mtime,
+						inode->i_sb->s_time_gran);
+
+	if (valid & ATTR_CTIME)
+		inode->i_ctime = timespec_trunc(attr->ia_ctime,
+						inode->i_sb->s_time_gran);
+
+	if (valid & ATTR_MODE) {
+		umode_t new_mode = attr->ia_mode;
+
+		/* The group check uses @inode->i_gid as possibly set above */
+		if (!(in_group_p(inode->i_gid) || capable(CAP_FSETID)))
+			new_mode &= ~S_ISGID;
+		inode->i_mode = new_mode;
+	}
+}
+
+/**
+ * do_truncation - truncate an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call when the inode is truncated
+ * to a smaller size. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int do_truncation(struct ubifs_info *c, struct inode *inode,
+ const struct iattr *attr)
+{
+ int err;
+ struct ubifs_budget_req req;
+ loff_t old_size = inode->i_size, new_size = attr->ia_size;
+ /* Offset of the new size within its UBIFS block */
+ int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
+ memset(&req, 0, sizeof(struct ubifs_budget_req));
+
+ /*
+ * If this is truncation to a smaller size, and we do not truncate on a
+ * block boundary, budget for changing one data block, because the last
+ * block will be re-written.
+ */
+ if (new_size & (UBIFS_BLOCK_SIZE - 1))
+ req.dirtied_page = 1;
+
+ req.dirtied_ino = 1;
+ /* A funny way to budget for truncation node */
+ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ /* Note, 'vmtruncate()' is called while @ui->ui_mutex is not held */
+ err = vmtruncate(inode, new_size);
+ if (err)
+ goto out_budg;
+
+ if (offset) {
+ pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ /* Look up (and lock) the page which contains the new inode size */
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ if (PageDirty(page)) {
+ /*
+ * 'ubifs_jnl_truncate()' will try to truncate
+ * the last data node, but it contains
+ * out-of-date data because the page is dirty.
+ * Write the page now, so that
+ * 'ubifs_jnl_truncate()' will see an already
+ * truncated (and up to date) data node.
+ */
+ ubifs_assert(PagePrivate(page));
+
+ clear_page_dirty_for_io(page);
+ /*
+ * NOTE(review): presumably non-zero when a page
+ * holds several blocks, in which case the offset
+ * within the page is what has to be written.
+ */
+ if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
+ offset = new_size &
+ (PAGE_CACHE_SIZE - 1);
+ /* 'do_writepage()' unlocks the page */
+ err = do_writepage(page, offset);
+ page_cache_release(page);
+ if (err)
+ goto out_budg;
+ /*
+ * We could now tell 'ubifs_jnl_truncate()' not
+ * to read the last block.
+ */
+ } else {
+ /*
+ * We could 'kmap()' the page and pass the data
+ * to 'ubifs_jnl_truncate()' to save it from
+ * having to read it.
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ }
+
+ mutex_lock(&ui->ui_mutex);
+ ui->ui_size = inode->i_size;
+ /* Truncation changes inode [mc]time */
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* The other attributes may be changed at the same time as well */
+ do_attr_changes(inode, attr);
+
+ err = ubifs_jnl_truncate(c, inode, old_size, new_size);
+ mutex_unlock(&ui->ui_mutex);
+ /* The budget is released on success as well as on failure */
+out_budg:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * do_setattr - change inode attributes.
+ * @c: UBIFS file-system description object
+ * @inode: inode to change attributes for
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call for all cases except
+ * truncations to smaller size. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int do_setattr(struct ubifs_info *c, struct inode *inode,
+ const struct iattr *attr)
+{
+ int err, release;
+ /* Only meaningful if %ATTR_SIZE is set in @attr->ia_valid */
+ loff_t new_size = attr->ia_size;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ui->data_len };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ dbg_gen("size %lld -> %lld", inode->i_size, new_size);
+ /* Note, 'vmtruncate()' is called while @ui->ui_mutex is not held */
+ err = vmtruncate(inode, new_size);
+ if (err)
+ goto out;
+ }
+
+ mutex_lock(&ui->ui_mutex);
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* Truncation changes inode [mc]time */
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* 'vmtruncate()' changed @i_size, update @ui_size */
+ ui->ui_size = inode->i_size;
+ }
+
+ do_attr_changes(inode, attr);
+
+ /*
+ * Remember whether the inode was dirty before it is marked dirty below.
+ * NOTE(review): presumably an already dirty inode holds its own budget,
+ * which is why the budget taken above is released in that case - confirm.
+ */
+ release = ui->dirty;
+ if (attr->ia_valid & ATTR_SIZE)
+ /*
+ * Inode length changed, so we have to make sure
+ * @I_DIRTY_DATASYNC is set.
+ */
+ __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ else
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &req);
+ /* Synchronous inodes are written straight away */
+ if (IS_SYNC(inode))
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ return err;
+
+out:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/*
+ * VFS '->setattr()' implementation. After the generic permission check and the
+ * debugging check of the synchronized inode size, truncations to a smaller
+ * size are handed over to 'do_truncation()' and everything else to
+ * 'do_setattr()'. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	int err;
+
+	dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid);
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	err = dbg_check_synced_i_size(inode);
+	if (err)
+		return err;
+
+	/* Truncation to a smaller size */
+	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
+		return do_truncation(c, inode, attr);
+
+	return do_setattr(c, inode, attr);
+}
+
+/*
+ * VFS '->invalidatepage()' implementation. If the whole page is invalidated
+ * (@offset is zero), the page budget is released, the page stops being counted
+ * in @c->dirty_pg_cnt and its "private" and "checked" flags are cleared.
+ * Nothing is done for partial invalidation.
+ */
+static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *host = page->mapping->host;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+
+	ubifs_assert(PagePrivate(page));
+
+	/* Partial page remains dirty, so only act on full invalidation */
+	if (offset == 0) {
+		if (PageChecked(page))
+			release_new_page_budget(c);
+		else
+			release_existing_page_budget(c);
+
+		atomic_long_dec(&c->dirty_pg_cnt);
+		ClearPagePrivate(page);
+		ClearPageChecked(page);
+	}
+}
+
+/*
+ * VFS '->follow_link()' implementation: hand the @data field of the UBIFS
+ * inode over to 'nd_set_link()'. Always returns %NULL.
+ */
+static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+
+	nd_set_link(nd, ubifs_inode(inode)->data);
+
+	return NULL;
+}
+
+/*
+ * VFS '->fsync()' implementation. Writes the inode (unless this is a data-only
+ * synchronization and the inode does not have %I_DIRTY_DATASYNC set) and then
+ * synchronizes the write-buffers which may contain nodes of this inode.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	int err;
+
+	dbg_gen("syncing inode %lu", inode->i_ino);
+
+	/*
+	 * VFS has already synchronized dirty pages for this inode, so what is
+	 * left is the inode itself, which may be skipped for 'datasync()'.
+	 */
+	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
+		err = inode->i_sb->s_op->write_inode(inode, 1);
+		if (err)
+			return err;
+	}
+
+	/* Nodes of this inode may still sit in a write-buffer, flush them */
+	return ubifs_sync_wbufs_by_inode(c, inode);
+}
+
+/**
+ * mctime_update_needed - check if mtime or ctime update is needed.
+ * @inode: the inode to do the check for
+ * @now: current time
+ *
+ * Returns %1 if the mtime or the ctime of @inode differs from @now and %0 if
+ * both are equal to it, in which case the update may be skipped. This is an
+ * optimization.
+ */
+static inline int mctime_update_needed(const struct inode *inode,
+				       const struct timespec *now)
+{
+	return !timespec_equal(&inode->i_mtime, now) ||
+	       !timespec_equal(&inode->i_ctime, now);
+}
+
+/**
+ * update_mctime - update mtime and ctime of an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to update
+ *
+ * This function updates mtime and ctime of the inode if it is not equivalent to
+ * current time. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int update_mctime(struct ubifs_info *c, struct inode *inode)
+{
+ struct timespec now = ubifs_current_time(inode);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (mctime_update_needed(inode, &now)) {
+ int err, release;
+ /* Budget for making the inode dirty */
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ui->data_len };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* Remember whether the inode was dirty before it is marked dirty */
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+ /* The budget taken above is given back if the inode was already dirty */
+ if (release)
+ ubifs_release_budget(c, &req);
+ }
+
+ return 0;
+}
+
+/*
+ * '->aio_write()' implementation. Updates the inode time-stamps, lets
+ * 'generic_file_aio_write()' do the write and, if something was written and
+ * either the inode is synchronous or the file was opened with %O_SYNC,
+ * synchronizes the write-buffers of the inode. Returns what
+ * 'generic_file_aio_write()' returned or a negative error code.
+ */
+static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	struct inode *inode = filp->f_mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	ssize_t written;
+	int err;
+
+	err = update_mctime(c, inode);
+	if (err)
+		return err;
+
+	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
+	/* Errors and empty writes are returned as they are */
+	if (written <= 0)
+		return written;
+
+	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) {
+		err = ubifs_sync_wbufs_by_inode(c, inode);
+		if (err)
+			return err;
+	}
+
+	return written;
+}
+
+/*
+ * VFS '->set_page_dirty()' implementation. Passes the page on to
+ * '__set_page_dirty_nobuffers()' and returns its result, which is asserted to
+ * be zero.
+ */
+static int ubifs_set_page_dirty(struct page *page)
+{
+	int dirtied = __set_page_dirty_nobuffers(page);
+
+	/*
+	 * A non-zero result is an attempt to dirty a page without budgeting
+	 * for it, which should not happen.
+	 */
+	ubifs_assert(dirtied == 0);
+
+	return dirtied;
+}
+
+/*
+ * VFS '->releasepage()' implementation. Returns zero for pages which are under
+ * write-back. For any other page the assertions below fire (this is not
+ * supposed to happen), the "private" and "checked" flags are cleared and %1 is
+ * returned.
+ */
+static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+{
+	if (!PageWriteback(page)) {
+		/*
+		 * An attempt to release a dirty page without budgeting for it
+		 * - should not happen.
+		 */
+		ubifs_assert(PagePrivate(page));
+		ubifs_assert(0);
+		ClearPagePrivate(page);
+		ClearPageChecked(page);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made
+ * writable. UBIFS must ensure page is budgeted for.
+ *
+ * Returns zero in case of success, %-EROFS if the media is read-only, %-EINVAL
+ * if the page was truncated in the meantime, or the error code of
+ * 'ubifs_budget_space()'.
+ */
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct timespec now = ubifs_current_time(inode);
+ struct ubifs_budget_req req = { .new_page = 1 };
+ int err, update_time;
+
+ dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
+ i_size_read(inode));
+ ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
+
+ if (unlikely(c->ro_media))
+ return -EROFS;
+
+ /*
+ * We have not locked @page so far so we may budget for changing the
+ * page. Note, we cannot do this after we locked the page, because
+ * budgeting may cause write-back which would cause deadlock.
+ *
+ * At the moment we do not know whether the page is dirty or not, so we
+ * assume that it is not and budget for a new page. We could look at
+ * the @PG_private flag and figure this out, but we may race with write
+ * back and the page state may change by the time we lock it, so this
+ * would need additional care. We do not bother with this at the
+ * moment, although it might be good idea to do. Instead, we allocate
+ * budget for a new page and amend it later on if the page was in fact
+ * dirty.
+ *
+ * The budgeting-related logic of this function is similar to what we
+ * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
+ * for more comments.
+ */
+ update_time = mctime_update_needed(inode, &now);
+ if (update_time)
+ /*
+ * We have to change inode time stamp which requires extra
+ * budgeting.
+ */
+ req.dirtied_ino = 1;
+
+ err = ubifs_budget_space(c, &req);
+ if (unlikely(err)) {
+ if (err == -ENOSPC)
+ ubifs_warn("out of space for mmapped file "
+ "(inode number %lu)", inode->i_ino);
+ return err;
+ }
+
+ lock_page(page);
+ if (unlikely(page->mapping != inode->i_mapping ||
+ page_offset(page) > i_size_read(inode))) {
+ /* Page got truncated out from underneath us */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* The page already has the private flag: give the new page budget back */
+ if (PagePrivate(page))
+ release_new_page_budget(c);
+ else {
+ /*
+ * NOTE(review): presumably turns the new page budget into an
+ * existing page budget - confirm in 'ubifs_convert_page_budget()'.
+ */
+ if (!PageChecked(page))
+ ubifs_convert_page_budget(c);
+ SetPagePrivate(page);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
+
+ if (update_time) {
+ int release;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ mutex_lock(&ui->ui_mutex);
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* Remember whether the inode was dirty before it is marked dirty */
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_dirty_inode_budget(c, ui);
+ }
+
+ unlock_page(page);
+ return 0;
+
+out_unlock:
+ unlock_page(page);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/*
+ * VM operations installed by 'ubifs_file_mmap()': page faults are served by
+ * the generic 'filemap_fault()', write-protection faults by
+ * 'ubifs_vm_page_mkwrite()'.
+ */
+static struct vm_operations_struct ubifs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ubifs_vm_page_mkwrite,
+};
+
+/*
+ * '->mmap()' implementation: set the mapping up with 'generic_file_mmap()'
+ * and, if that succeeds, install the UBIFS VM operations. Returns zero in case
+ * of success and the error code of 'generic_file_mmap()' otherwise.
+ */
+static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* 'generic_file_mmap()' takes care of NOMMU case */
+	int err = generic_file_mmap(file, vma);
+
+	if (!err)
+		vma->vm_ops = &ubifs_file_vm_ops;
+
+	return err;
+}
+
+/* Table of the address space operations implemented by UBIFS */
+struct address_space_operations ubifs_file_address_operations = {
+ .readpage = ubifs_readpage,
+ .writepage = ubifs_writepage,
+ .write_begin = ubifs_write_begin,
+ .write_end = ubifs_write_end,
+ .invalidatepage = ubifs_invalidatepage,
+ .set_page_dirty = ubifs_set_page_dirty,
+ .releasepage = ubifs_releasepage,
+};
+
+/*
+ * Table of the file inode operations. The extended attribute operations are
+ * present only if %CONFIG_UBIFS_FS_XATTR is defined.
+ */
+struct inode_operations ubifs_file_inode_operations = {
+ .setattr = ubifs_setattr,
+ .getattr = ubifs_getattr,
+#ifdef CONFIG_UBIFS_FS_XATTR
+ .setxattr = ubifs_setxattr,
+ .getxattr = ubifs_getxattr,
+ .listxattr = ubifs_listxattr,
+ .removexattr = ubifs_removexattr,
+#endif
+};
+
+/*
+ * Table of the symlink inode operations: links are read with the generic
+ * 'generic_readlink()' and followed with 'ubifs_follow_link()'.
+ */
+struct inode_operations ubifs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = ubifs_follow_link,
+ .setattr = ubifs_setattr,
+ .getattr = ubifs_getattr,
+};
+
+/*
+ * Table of the file operations. Reading, seeking and splicing use the generic
+ * helpers, writing goes through 'ubifs_aio_write()'. The compat ioctl handler
+ * is present only if %CONFIG_COMPAT is defined.
+ */
+struct file_operations ubifs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ubifs_aio_write,
+ .mmap = ubifs_file_mmap,
+ .fsync = ubifs_fsync,
+ .unlocked_ioctl = ubifs_ioctl,
+ .splice_read = generic_file_splice_read,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ubifs_compat_ioctl,
+#endif
+};
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
new file mode 100644
index 00000000000..10394c54836
--- /dev/null
+++ b/fs/ubifs/find.c
@@ -0,0 +1,975 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file contains functions for finding LEBs for various purposes e.g.
+ * garbage collection. In general, lprops category heaps and lists are used
+ * for fast access, falling back on scanning the LPT as a last resort.
+ */
+
+#include <linux/sort.h>
+#include "ubifs.h"
+
+/**
+ * struct scan_data - data provided to scan callback functions
+ * @min_space: minimum number of bytes for which to scan
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @lnum: LEB number found is returned here
+ * @exclude_index: whether to exclude index LEBs
+ *
+ * The caller of the scan fills this structure in and the scan callback stores
+ * the number of the LEB it has settled on in @lnum.
+ */
+struct scan_data {
+ int min_space;
+ int pick_free;
+ int lnum;
+ int exclude_index;
+};
+
+/**
+ * valuable - determine whether LEB properties are valuable.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * This function returns %1 if the LEB properties should be added to the LEB
+ * properties tree in memory. Otherwise %0 is returned. The decision depends on
+ * the category of the LEB properties: freeable and freeable-index properties
+ * are always valuable, empty ones are valuable while fewer than @c->lsave_cnt
+ * empty or freeable LEBs are available, and those of the heap categories are
+ * valuable if their heap is not full or if they have at least @c->dark_wm
+ * bytes of free plus dirty space.
+ */
+static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
+{
+	const int cat = lprops->flags & LPROPS_CAT_MASK;
+	struct ubifs_lpt_heap *heap;
+	int avail;
+
+	switch (cat) {
+	case LPROPS_FREEABLE:
+	case LPROPS_FRDI_IDX:
+		return 1;
+	case LPROPS_EMPTY:
+		avail = c->lst.empty_lebs + c->freeable_cnt -
+			c->lst.taken_empty_lebs;
+		return avail < c->lsave_cnt ? 1 : 0;
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		heap = &c->lpt_heap[cat - 1];
+		if (heap->cnt < heap->max_cnt)
+			return 1;
+		return lprops->free + lprops->dirty >= c->dark_wm ? 1 : 0;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ * scan_for_dirty_cb - dirty space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * The returned code tells the scan what to do next: carry on
+ * (%LPT_SCAN_CONTINUE), add the LEB properties to the tree in main memory
+ * (%LPT_SCAN_ADD), or stop (%LPT_SCAN_STOP). When a suitable LEB is found, its
+ * number is stored in @data->lnum.
+ */
+static int scan_for_dirty_cb(struct ubifs_info *c,
+			     const struct ubifs_lprops *lprops, int in_tree,
+			     struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+	int space;
+
+	/* LEBs which are currently in use are skipped straight away */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable LEB properties are added to the tree in any case */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	space = lprops->free + lprops->dirty;
+
+	/* Too little space */
+	if (space < data->min_space)
+		return verdict;
+
+	/* Index LEB, but index LEBs have to be excluded */
+	if (data->exclude_index && (lprops->flags & LPROPS_INDEX))
+		return verdict;
+
+	if (space == c->leb_size) {
+		/* Empty or freeable LEB, which is OK only if asked for */
+		if (!data->pick_free)
+			return verdict;
+	} else if (lprops->dirty < c->dead_wm) {
+		/* Too little dirty space */
+		return verdict;
+	}
+
+	/* Finally we found space */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_dirty - find an LEB with enough free plus dirty space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ * have
+ * @pick_free: if it is OK to return a free or freeable LEB
+ * @exclude_index: whether to exclude index LEBs
+ *
+ * The free heap and the uncategorized list are searched first, and the LPT is
+ * scanned only if they contain nothing suitable and not all pnodes are in
+ * memory. This function returns a pointer to the LEB properties found or a
+ * negative error code.
+ */
+static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
+ int min_space, int pick_free,
+ int exclude_index)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i;
+
+ /* There may be an LEB with enough dirty space on the free heap */
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (lprops->free + lprops->dirty < min_space)
+ continue;
+ if (lprops->dirty < c->dead_wm)
+ continue;
+ return lprops;
+ }
+ /*
+ * A LEB may have fallen off of the bottom of the dirty heap, and ended
+ * up as uncategorized even though it has enough dirty space for us now,
+ * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+ * can end up as uncategorized because they are kept on lists not
+ * finite-sized heaps.
+ */
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ if (lprops->flags & LPROPS_TAKEN)
+ continue;
+ if (lprops->free + lprops->dirty < min_space)
+ continue;
+ if (exclude_index && (lprops->flags & LPROPS_INDEX))
+ continue;
+ if (lprops->dirty < c->dead_wm)
+ continue;
+ return lprops;
+ }
+ /* We have looked everywhere in main memory, now scan the flash */
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return ERR_PTR(-ENOSPC);
+ data.min_space = min_space;
+ data.pick_free = pick_free;
+ data.lnum = -1;
+ data.exclude_index = exclude_index;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+ &data);
+ if (err)
+ return ERR_PTR(err);
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ /* Remember the LEB which was found in @c->lscan_lnum */
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ /* Make sure the scan callback has picked what has been asked for */
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free + lprops->dirty >= min_space);
+ ubifs_assert(lprops->dirty >= c->dead_wm ||
+ (pick_free &&
+ lprops->free + lprops->dirty == c->leb_size));
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector.
+ * @c: the UBIFS file-system description object
+ * @ret_lp: LEB properties are returned here on exit
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ * have
+ * @pick_free: controls whether it is OK to pick empty or index LEBs
+ *
+ * This function tries to find a dirty logical eraseblock which has at least
+ * @min_space free and dirty space. It prefers to take an LEB from the dirty or
+ * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
+ * or do not have an LEB which satisfies the @min_space criteria.
+ *
+ * Note:
+ * o LEBs which have less than dead watermark of dirty space are never picked
+ * by this function;
+ *
+ * Returns zero and the LEB properties of
+ * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
+ * negative error code in case of other failures. The returned LEB is marked as
+ * "taken".
+ *
+ * The additional @pick_free argument controls if this function has to return a
+ * free or freeable LEB if one is present. For example, GC must set it to %1,
+ * when called from the journal space reservation function, because the
+ * appearance of free space may coincide with the loss of enough dirty space
+ * for GC to succeed anyway.
+ *
+ * In contrast, if the Garbage Collector is called from budgeting, it should
+ * just make free space, not return LEBs which are already free or freeable.
+ *
+ * In addition @pick_free is set to %2 by the recovery process in order to
+ * recover gc_lnum in which case an index LEB must not be returned.
+ */
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+ int min_space, int pick_free)
+{
+ /* Index LEBs are excluded from the start if @pick_free is %2 */
+ int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0;
+ const struct ubifs_lprops *lp = NULL, *idx_lp = NULL;
+ struct ubifs_lpt_heap *heap, *idx_heap;
+
+ ubifs_get_lprops(c);
+
+ if (pick_free) {
+ int lebs, rsvd_idx_lebs = 0;
+
+ spin_lock(&c->space_lock);
+ /* Count the empty and freeable LEBs which are not taken */
+ lebs = c->lst.empty_lebs;
+ lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
+
+ /*
+ * Note, the index may consume more LEBs than have been reserved
+ * for it. It is OK because it might be consolidated by GC.
+ * But if the index takes fewer LEBs than it is reserved for it,
+ * this function must avoid picking those reserved LEBs.
+ */
+ if (c->min_idx_lebs >= c->lst.idx_lebs) {
+ rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ exclude_index = 1;
+ }
+ spin_unlock(&c->space_lock);
+
+ /* Check if there are enough free LEBs for the index */
+ if (rsvd_idx_lebs < lebs) {
+ /* OK, try to find an empty LEB */
+ lp = ubifs_fast_find_empty(c);
+ if (lp)
+ goto found;
+
+ /* Or a freeable LEB */
+ lp = ubifs_fast_find_freeable(c);
+ if (lp)
+ goto found;
+ } else
+ /*
+ * We cannot pick free/freeable LEBs in the below code.
+ */
+ pick_free = 0;
+ } else {
+ spin_lock(&c->space_lock);
+ exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+ spin_unlock(&c->space_lock);
+ }
+
+ /* Look on the dirty and dirty index heaps */
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+
+ /* Only the first element of each heap is considered */
+ if (idx_heap->cnt && !exclude_index) {
+ idx_lp = idx_heap->arr[0];
+ sum = idx_lp->free + idx_lp->dirty;
+ /*
+ * Since we reserve twice as more space for the index than it
+ * actually takes, it does not make sense to pick indexing LEBs
+ * with less than half LEB of dirty space.
+ */
+ if (sum < min_space || sum < c->half_leb_size)
+ idx_lp = NULL;
+ }
+
+ if (heap->cnt) {
+ lp = heap->arr[0];
+ if (lp->dirty + lp->free < min_space)
+ lp = NULL;
+ }
+
+ /* Pick the LEB with most space */
+ if (idx_lp && lp) {
+ if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty)
+ lp = idx_lp;
+ } else if (idx_lp && !lp)
+ lp = idx_lp;
+
+ if (lp) {
+ ubifs_assert(lp->dirty >= c->dead_wm);
+ goto found;
+ }
+
+ /* Did not find a dirty LEB on the dirty heaps, have to scan */
+ dbg_find("scanning LPT for a dirty LEB");
+ lp = scan_for_dirty(c, min_space, pick_free, exclude_index);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ ubifs_assert(lp->dirty >= c->dead_wm ||
+ (pick_free && lp->free + lp->dirty == c->leb_size));
+
+found:
+ dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+ lp->lnum, lp->free, lp->dirty, lp->flags);
+
+ /* Mark the LEB as taken, its free and dirty space are left unchanged */
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ /* The caller gets a copy of the LEB properties */
+ memcpy(ret_lp, lp, sizeof(struct ubifs_lprops));
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * scan_for_free_cb - free space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * The returned code tells the scan what to do next: carry on
+ * (%LPT_SCAN_CONTINUE), add the LEB properties to the tree in main memory
+ * (%LPT_SCAN_ADD), or stop (%LPT_SCAN_STOP). When a suitable LEB is found, its
+ * number is stored in @data->lnum.
+ */
+static int scan_for_free_cb(struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops, int in_tree,
+			    struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+
+	/* LEBs which are currently in use are skipped straight away */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable LEB properties are added to the tree in any case */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/*
+	 * The LEB is rejected if it is an index LEB, if it has too little free
+	 * space, or if it is empty while empty LEBs have not been asked for.
+	 */
+	if ((lprops->flags & LPROPS_INDEX) ||
+	    lprops->free < data->min_space ||
+	    (!data->pick_free && lprops->free == c->leb_size))
+		return verdict;
+
+	/*
+	 * LEBs that have only free and dirty space must not be allocated
+	 * because they may have been unmapped already or they may have data
+	 * that is obsolete only because of nodes that are still sitting in a
+	 * wbuf.
+	 */
+	if (lprops->dirty > 0 &&
+	    lprops->free + lprops->dirty == c->leb_size)
+		return verdict;
+
+	/* Finally we found space */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * do_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of free space required
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * The in-memory structures are consulted first: the fast lookups (in an order
+ * that depends on @squeeze), then the dirty heap, then the uncategorized list.
+ * The LPT is scanned only if all of them fail and not every pnode is in
+ * memory. This function does not change the flags of the LEB it returns.
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code encoded with ERR_PTR(); in particular %-ENOSPC is returned when
+ * nothing was found in memory and all pnodes are already in memory.
+ */
+static
+const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
+ int min_space, int pick_free,
+ int squeeze)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i;
+
+ /* With @squeeze, a LEB from the fast "free" lookup is tried first */
+ if (squeeze) {
+ lprops = ubifs_fast_find_free(c);
+ if (lprops && lprops->free >= min_space)
+ return lprops;
+ }
+ /* An empty LEB is accepted without checking @min_space */
+ if (pick_free) {
+ lprops = ubifs_fast_find_empty(c);
+ if (lprops)
+ return lprops;
+ }
+ /* Without @squeeze, the fast "free" lookup comes after the empty one */
+ if (!squeeze) {
+ lprops = ubifs_fast_find_free(c);
+ if (lprops && lprops->free >= min_space)
+ return lprops;
+ }
+ /* There may be an LEB with enough free space on the dirty heap */
+ /*
+ * NOTE(review): unlike the uncategorized list below, the flags are not
+ * checked here - presumably taken and index LEBs are never kept on this
+ * heap; confirm.
+ */
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (lprops->free >= min_space)
+ return lprops;
+ }
+ /*
+ * A LEB may have fallen off of the bottom of the free heap, and ended
+ * up as uncategorized even though it has enough free space for us now,
+ * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+ * can end up as uncategorized because they are kept on lists not
+ * finite-sized heaps.
+ */
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ if (lprops->flags & LPROPS_TAKEN)
+ continue;
+ if (lprops->flags & LPROPS_INDEX)
+ continue;
+ if (lprops->free >= min_space)
+ return lprops;
+ }
+ /* We have looked everywhere in main memory, now scan the flash */
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return ERR_PTR(-ENOSPC);
+ /* Hand the search criteria to scan_for_free_cb(); lnum is its result */
+ data.min_space = min_space;
+ data.pick_free = pick_free;
+ data.lnum = -1;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_for_free_cb,
+ &data);
+ if (err)
+ return ERR_PTR(err);
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ /* Record the LEB found; @c->lscan_lnum is passed to the next scan */
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ /* Re-check that the looked-up properties match what the callback saw */
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free >= min_space);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of required free space
+ * @free: contains amount of free space in the LEB on exit
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function looks for an LEB with at least @min_space bytes of free space.
+ * It tries to find an empty LEB if possible. If no empty LEBs are available,
+ * this function searches for a non-empty data LEB. The returned LEB is marked
+ * as "taken".
+ *
+ * The LEB properties are held (ubifs_get_lprops()) for the duration of the
+ * search and released again on every return path.
+ *
+ * This function returns found LEB number in case of success, %-ENOSPC if it
+ * failed to find a LEB with @min_space bytes of free space and other negative
+ * error codes in case of failure.
+ */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+ int squeeze)
+{
+ const struct ubifs_lprops *lprops;
+ int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
+
+ dbg_find("min_space %d", min_space);
+ ubifs_get_lprops(c);
+
+ /* Check if there are enough empty LEBs for commit */
+ spin_lock(&c->space_lock);
+ /* Number of LEBs the index is still short of @c->min_idx_lebs */
+ if (c->min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
+ /* Note: the inner 'if' below is the whole body of this brace-less 'if' */
+ if (rsvd_idx_lebs < lebs)
+ /*
+ * OK to allocate an empty LEB, but we still don't want to go
+ * looking for one if there aren't any.
+ */
+ if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ pick_free = 1;
+ /*
+ * Because we release the space lock, we must account
+ * for this allocation here. After the LEB properties
+ * flags have been updated, we subtract one. Note, the
+ * result of this is that lprops also decreases
+ * @taken_empty_lebs in 'ubifs_change_lp()', so it is
+ * off by one for a short period of time which may
+ * introduce a small disturbance to budgeting
+ * calculations, but this is harmless because at the
+ * worst case this would make the budgeting subsystem
+ * be more pessimistic than needed.
+ *
+ * Fundamentally, this is about serialization of the
+ * budgeting and lprops subsystems. We could make the
+ * @space_lock a mutex and avoid dropping it before
+ * calling 'ubifs_change_lp()', but mutex is more
+ * heavy-weight, and we want budgeting to be as fast as
+ * possible.
+ */
+ c->lst.taken_empty_lebs += 1;
+ }
+ spin_unlock(&c->space_lock);
+
+ lprops = do_find_free_space(c, min_space, pick_free, squeeze);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ /* Remember the LEB number and mark the LEB as taken */
+ lnum = lprops->lnum;
+ flags = lprops->flags | LPROPS_TAKEN;
+
+ lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ /* The flags are updated, drop the provisional accounting done above */
+ if (pick_free) {
+ spin_lock(&c->space_lock);
+ c->lst.taken_empty_lebs -= 1;
+ spin_unlock(&c->space_lock);
+ }
+
+ *free = lprops->free;
+ ubifs_release_lprops(c);
+
+ if (*free == c->leb_size) {
+ /*
+ * Ensure that empty LEBs have been unmapped. They may not have
+ * been, for example, because of an unclean unmount. Also
+ * LEBs that were freeable LEBs (free + dirty == leb_size) will
+ * not have been unmapped.
+ */
+ err = ubifs_leb_unmap(c, lnum);
+ /*
+ * NOTE(review): on this error path the LEB keeps the
+ * LPROPS_TAKEN flag set above - confirm this is intended.
+ */
+ if (err)
+ return err;
+ }
+
+ dbg_find("found LEB %d, free %d", lnum, *free);
+ ubifs_assert(*free >= min_space);
+ return lnum;
+
+out:
+ /* Error path: undo the provisional accounting and release lprops */
+ if (pick_free) {
+ spin_lock(&c->space_lock);
+ c->lst.taken_empty_lebs -= 1;
+ spin_unlock(&c->space_lock);
+ }
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * scan_for_idx_cb - LPT scan callback used to find a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties being examined
+ * @in_tree: non-zero if the LEB properties are already in main memory
+ * @data: scan state; the LEB number found is stored in @data->lnum
+ *
+ * A taken LEB is skipped with %LPT_SCAN_CONTINUE. Otherwise the result carries
+ * %LPT_SCAN_ADD when the properties are not in the tree yet and valuable()
+ * accepts them. The scan is stopped (%LPT_SCAN_ADD | %LPT_SCAN_STOP) at the
+ * first non-index LEB whose free and dirty space add up to the LEB size.
+ */
+static int scan_for_idx_cb(struct ubifs_info *c,
+			   const struct ubifs_lprops *lprops, int in_tree,
+			   struct scan_data *data)
+{
+	int verdict;
+
+	/* LEBs that are currently in use are of no interest at all */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Decide whether these LEB properties should be added to the tree */
+	verdict = LPT_SCAN_CONTINUE;
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/* Skip index LEBs as well as LEBs that cannot be made empty */
+	if ((lprops->flags & LPROPS_INDEX) ||
+	    lprops->free + lprops->dirty != c->leb_size)
+		return verdict;
+
+	/*
+	 * We are allocating for the index so it is safe to allocate LEBs with
+	 * only free and dirty space, because write buffers are sync'd at commit
+	 * start.
+	 */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_STOP | LPT_SCAN_ADD;
+}
+
+/**
+ * scan_for_leb_for_idx - scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * Scans the LPT with scan_for_idx_cb() and looks up the LEB properties of the
+ * LEB the callback selected. Returns those LEB properties, or an error code
+ * encoded with ERR_PTR() if the scan or the lookup failed.
+ */
+static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
+{
+	struct scan_data data;
+	struct ubifs_lprops *lp;
+	int ret;
+
+	/* scan_for_idx_cb() only writes the result field */
+	data.lnum = -1;
+	ret = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_idx_cb,
+				    &data);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	/* Record the LEB found; @c->lscan_lnum is passed to the next scan */
+	c->lscan_lnum = data.lnum;
+
+	lp = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (!IS_ERR(lp)) {
+		/* The properties must still match what the callback accepted */
+		ubifs_assert(lp->lnum == data.lnum);
+		ubifs_assert(lp->free + lp->dirty == c->leb_size);
+		ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+		ubifs_assert(!(lp->flags & LPROPS_INDEX));
+	}
+	return lp;
+}
+
+/**
+ * ubifs_find_free_leb_for_idx - find a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * This function looks for a free LEB and returns that LEB number. The returned
+ * LEB is marked as "taken", "index".
+ *
+ * Only empty LEBs are allocated. This is for two reasons. First, the commit
+ * calculates the number of LEBs to allocate based on the assumption that they
+ * will be empty. Secondly, free space at the end of an index LEB is not
+ * guaranteed to be empty because it may have been used by the in-the-gaps
+ * method prior to an unclean unmount.
+ *
+ * Candidates are tried in this order: the fast "empty" lookup, the fast
+ * "freeable" lookup, and finally a scan of the LPT, which is attempted only
+ * while the empty LEB count exceeds the taken empty LEB count.
+ *
+ * If no LEB is found %-ENOSPC is returned. For other failures another negative
+ * error code is returned.
+ */
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lprops;
+ int lnum = -1, err, flags;
+
+ ubifs_get_lprops(c);
+
+ lprops = ubifs_fast_find_empty(c);
+ if (!lprops) {
+ lprops = ubifs_fast_find_freeable(c);
+ if (!lprops) {
+ ubifs_assert(c->freeable_cnt == 0);
+ /* Scan only if the counters say an untaken empty LEB exists */
+ if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ lprops = scan_for_leb_for_idx(c);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+ }
+ }
+ }
+
+ /* Still NULL here means none of the lookups produced a candidate */
+ if (!lprops) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ lnum = lprops->lnum;
+
+ dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+ lnum, lprops->free, lprops->dirty, lprops->flags);
+
+ /*
+ * Mark the LEB as a taken index LEB. Presumably the third and fourth
+ * arguments are the new free and dirty values, i.e. the LEB is recorded
+ * as completely free - TODO confirm against ubifs_change_lp().
+ */
+ flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
+ lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ ubifs_release_lprops(c);
+
+ /*
+ * Ensure that empty LEBs have been unmapped. They may not have been,
+ * for example, because of an unclean unmount. Also LEBs that were
+ * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
+ */
+ err = ubifs_leb_unmap(c, lnum);
+ if (err) {
+ /*
+ * Unmapping failed; the call below names the "taken" and "index"
+ * flags, presumably to clear them again - TODO confirm. Its own
+ * return value is ignored and the unmap error is reported.
+ */
+ ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_TAKEN | LPROPS_INDEX, 0);
+ return err;
+ }
+
+ return lnum;
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/*
+ * Comparison helper for sort(): orders LEB properties by the sum of their
+ * dirty and free space, smallest first. @a and @b point at array slots holding
+ * LEB properties pointers.
+ */
+static int cmp_dirty_idx(const struct ubifs_lprops **a,
+			 const struct ubifs_lprops **b)
+{
+	return (*a)->dirty + (*a)->free - (*b)->dirty - (*b)->free;
+}
+
+/*
+ * Swap helper for sort(): exchanges the LEB properties pointers stored in the
+ * two array slots @a and @b. @size is not used.
+ */
+static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
+			   int size)
+{
+	struct ubifs_lprops *lp_a = *a;
+	struct ubifs_lprops *lp_b = *b;
+
+	*a = lp_b;
+	*b = lp_a;
+}
+
+/**
+ * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is called each commit. It copies the %LPROPS_DIRTY_IDX heap
+ * into @c->dirty_idx, sorts the copy by dirty plus free space in ascending
+ * order and then replaces each LEB properties pointer in the copy with the
+ * corresponding LEB number. The result is used by the in-the-gaps method of
+ * TNC commit. Always returns %0.
+ */
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
+{
+	struct ubifs_lpt_heap *src;
+	int pos;
+
+	ubifs_get_lprops(c);
+
+	/* Take a snapshot of the LPROPS_DIRTY_IDX heap */
+	src = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	c->dirty_idx.cnt = src->cnt;
+	memcpy(c->dirty_idx.arr, src->arr,
+	       sizeof(void *) * c->dirty_idx.cnt);
+
+	/* Order the snapshot so that the dirtiest LEB ends up last */
+	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
+	     (int (*)(const void *, const void *))cmp_dirty_idx,
+	     (void (*)(void *, void *, int))swap_dirty_idx);
+
+	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
+	if (c->dirty_idx.cnt != 0)
+		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
+
+	/* From here on each slot holds a LEB number, not an lprops pointer */
+	for (pos = c->dirty_idx.cnt - 1; pos >= 0; pos--)
+		c->dirty_idx.arr[pos] =
+			(void *)(size_t)c->dirty_idx.arr[pos]->lnum;
+
+	ubifs_release_lprops(c);
+	return 0;
+}
+
+/**
+ * scan_dirty_idx_cb - LPT scan callback used to find a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties being examined
+ * @in_tree: non-zero if the LEB properties are already in main memory
+ * @data: scan state; the LEB number found is stored in @data->lnum
+ *
+ * A taken LEB is skipped with %LPT_SCAN_CONTINUE. Otherwise the result carries
+ * %LPT_SCAN_ADD when the properties are not in the tree yet and valuable()
+ * accepts them. The scan is stopped (%LPT_SCAN_ADD | %LPT_SCAN_STOP) at the
+ * first index LEB whose free plus dirty space is at least
+ * @c->min_idx_node_sz.
+ */
+static int scan_dirty_idx_cb(struct ubifs_info *c,
+			     const struct ubifs_lprops *lprops, int in_tree,
+			     struct scan_data *data)
+{
+	int verdict;
+
+	/* LEBs that are currently in use are of no interest at all */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Decide whether these LEB properties should be added to the tree */
+	verdict = LPT_SCAN_CONTINUE;
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/* Only index LEBs with enough free plus dirty space qualify */
+	if (!(lprops->flags & LPROPS_INDEX) ||
+	    lprops->free + lprops->dirty < c->min_idx_node_sz)
+		return verdict;
+
+	/* This LEB will do: report it and end the scan */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_STOP | LPT_SCAN_ADD;
+}
+
+/**
+ * find_dirty_idx_leb - find a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB number upon success and a negative error code upon
+ * failure. In particular, -ENOSPC is returned if a dirty index LEB is not
+ * found.
+ *
+ * The in-memory structures (the %LPROPS_DIRTY_IDX heap, @c->frdi_idx_list and
+ * @c->uncat_list) are checked with scan_dirty_idx_cb() before the LPT is
+ * scanned. The LEB found is marked as taken.
+ *
+ * Note that this function scans the entire LPT but it is called very rarely.
+ */
+static int find_dirty_idx_leb(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i, ret;
+
+ /* Check all structures in memory first */
+ /*
+ * The scan callback is reused as the match test for in-memory entries;
+ * only the LPT_SCAN_STOP bit of its result is examined here.
+ */
+ data.lnum = -1;
+ heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return -ENOSPC;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+ &data);
+ if (err)
+ return err;
+found:
+ /* Reached from the in-memory checks and from a successful LPT scan */
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return PTR_ERR(lprops);
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert((lprops->flags & LPROPS_INDEX));
+
+ dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
+ lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
+
+ /* Mark the LEB as taken before handing its number to the caller */
+ lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
+ lprops->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lprops))
+ return PTR_ERR(lprops);
+
+ return lprops->lnum;
+}
+
+/**
+ * get_idx_gc_leb - try to get a LEB number from trivial GC.
+ * @c: the UBIFS file-system description object
+ *
+ * Obtains a LEB number from ubifs_get_idx_gc_leb() and sets the
+ * %LPROPS_INDEX flag in its LEB properties. Returns the LEB number on success
+ * and a negative error code on failure.
+ */
+static int get_idx_gc_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int lnum;
+
+	lnum = ubifs_get_idx_gc_leb(c);
+	if (lnum < 0)
+		return lnum;
+
+	/*
+	 * The LEB was due to be unmapped after the commit but
+	 * it is needed now for this commit.
+	 */
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (unlikely(IS_ERR(lp)))
+		return PTR_ERR(lp);
+
+	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+			     lp->flags | LPROPS_INDEX, -1);
+	if (unlikely(IS_ERR(lp)))
+		return PTR_ERR(lp);
+
+	dbg_find("LEB %d, dirty %d and free %d flags %#x",
+		 lp->lnum, lp->dirty, lp->free, lp->flags);
+	return lnum;
+}
+
+/**
+ * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
+ * @c: the UBIFS file-system description object
+ *
+ * Entries are consumed from the end of the @c->dirty_idx array, which holds
+ * LEB numbers. An entry is skipped if its LEB is currently taken or is no
+ * longer an index LEB. The first LEB that passes is marked as taken.
+ *
+ * This function returns the LEB number on success, %-ENOSPC if the array has
+ * been exhausted, and another negative error code if looking up or changing
+ * the LEB properties failed.
+ */
+static int find_dirtiest_idx_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int lnum;
+
+	while (1) {
+		if (!c->dirty_idx.cnt)
+			return -ENOSPC;
+		/* The lprops pointers were replaced by LEB numbers */
+		lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
+		lp = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		/* The array is a snapshot, so re-validate the LEB's state */
+		if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
+			continue;
+		lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+				     lp->flags | LPROPS_TAKEN, 0);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		break;
+	}
+	dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
+		 lp->free, lp->flags);
+	/*
+	 * Test the flag bits with '&'. OR-ing with a non-zero flag constant
+	 * always yields a non-zero value, so such an assertion can never fail.
+	 */
+	ubifs_assert(lp->flags & LPROPS_TAKEN);
+	ubifs_assert(lp->flags & LPROPS_INDEX);
+	return lnum;
+}
+
+/**
+ * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
+ * @c: the UBIFS file-system description object
+ *
+ * This function attempts to find an untaken index LEB with the most free and
+ * dirty space that can be used without overwriting index nodes that were in the
+ * last index committed. Three sources are tried in turn, each one only if the
+ * previous one returned %-ENOSPC. The result of the last source tried is
+ * returned.
+ */
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
+{
+	int ret;
+
+	ubifs_get_lprops(c);
+
+	/*
+	 * We made an array of the dirtiest index LEB numbers as at the start of
+	 * last commit. Try that array first.
+	 */
+	ret = find_dirtiest_idx_leb(c);
+	if (ret != -ENOSPC)
+		goto done;
+
+	/* Next try scanning the entire LPT */
+	ret = find_dirty_idx_leb(c);
+	if (ret != -ENOSPC)
+		goto done;
+
+	/* Finally take any index LEBs awaiting trivial GC */
+	ret = get_idx_gc_leb(c);
+
+done:
+	ubifs_release_lprops(c);
+	return ret;
+}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
new file mode 100644
index 00000000000..d0f3dac2908
--- /dev/null
+++ b/fs/ubifs/gc.c
@@ -0,0 +1,773 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements garbage collection. The procedure for garbage collection
+ * is different depending on whether a LEB is an index LEB (contains index
+ * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
+ * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
+ * nodes to the journal, at which point the garbage-collected LEB is free to be
+ * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
+ * dirty in the TNC, and after the next commit, the garbage-collected LEB is
+ * to be reused. Garbage collection will cause the number of dirty index nodes
+ * to grow, however sufficient space is reserved for the index to ensure the
+ * commit will never run out of space.
+ */
+
+#include <linux/pagemap.h>
+#include "ubifs.h"
+
+/*
+ * GC tries to optimize the way it fits nodes to available space, and it sorts
+ * nodes a little. The below constants are watermarks which define "large",
+ * "medium", and "small" nodes.
+ */
+#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
+#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ *
+ * This function switches the GC head to the next LEB, which is reserved in
+ * @c->gc_lnum. The GC write-buffer is synchronized, the reserved LEB is
+ * unmapped and added to the log as a bud, @c->gc_lnum is reset to %-1 and the
+ * write-buffer is moved to the start of that LEB.
+ *
+ * Returns %0 in case of success, %-EAGAIN if commit is required, and other
+ * negative error code in case of failures.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+	int err, gc_lnum = c->gc_lnum;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert(gc_lnum != -1);
+	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+	       c->leb_size - wbuf->offs - wbuf->used);
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	/*
+	 * The GC write-buffer was synchronized, we may safely unmap
+	 * 'c->gc_lnum'.
+	 */
+	err = ubifs_leb_unmap(c, gc_lnum);
+	if (err)
+		return err;
+
+	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+	if (err)
+		return err;
+
+	/* The reserved LEB is consumed; it becomes the GC head itself */
+	c->gc_lnum = -1;
+	return ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes nodes to move
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. The obsolete nodes are dropped.
+ *
+ * When moving nodes we have to deal with classical bin-packing problem: the
+ * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
+ * where the nodes in the @sleb->nodes list are the elements which should be
+ * fit optimally to the bins. This function uses the "first fit decreasing"
+ * strategy, although it does not really sort the nodes but just split them on
+ * 3 classes - large, medium, and small, so they are roughly sorted.
+ *
+ * Every scan node that is taken off @sleb->nodes is freed by this function,
+ * on success as well as on failure.
+ *
+ * This function returns zero in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of other failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	struct ubifs_scan_node *snod, *tmp;
+	struct list_head large, medium, small;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+	int avail, err, min = INT_MAX;
+
+	INIT_LIST_HEAD(&large);
+	INIT_LIST_HEAD(&medium);
+	INIT_LIST_HEAD(&small);
+
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		struct list_head *lst;
+
+		ubifs_assert(snod->type != UBIFS_IDX_NODE);
+		ubifs_assert(snod->type != UBIFS_REF_NODE);
+		ubifs_assert(snod->type != UBIFS_CS_NODE);
+
+		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
+					 snod->offs, 0);
+		if (err < 0)
+			/* The three lists are not joined yet */
+			goto out_join;
+
+		lst = &snod->list;
+		list_del(lst);
+		if (!err) {
+			/* The node is obsolete, remove it from the list */
+			kfree(snod);
+			continue;
+		}
+
+		/*
+		 * Sort the list of nodes so that large nodes go first, and
+		 * small nodes go last.
+		 */
+		if (snod->len > MEDIUM_NODE_WM)
+			list_add(lst, &large);
+		else if (snod->len > SMALL_NODE_WM)
+			list_add(lst, &medium);
+		else
+			list_add(lst, &small);
+
+		/* And find the smallest node */
+		if (snod->len < min)
+			min = snod->len;
+	}
+
+	/*
+	 * Join the three lists so that we'd have one roughly sorted list
+	 * ('large' will be the head of the joined list).
+	 */
+	list_splice(&medium, large.prev);
+	list_splice(&small, large.prev);
+
+	if (wbuf->lnum == -1) {
+		/*
+		 * The GC journal head is not set, because it is the first GC
+		 * invocation since mount.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			goto out;
+	}
+
+	/* Write nodes to their new location. Use the first-fit strategy */
+	while (1) {
+		avail = c->leb_size - wbuf->offs - wbuf->used;
+		list_for_each_entry_safe(snod, tmp, &large, list) {
+			int new_lnum, new_offs;
+
+			/* Not even the smallest node fits any more */
+			if (avail < min)
+				break;
+
+			if (snod->len > avail)
+				/* This node does not fit */
+				continue;
+
+			cond_resched();
+
+			new_lnum = wbuf->lnum;
+			new_offs = wbuf->offs + wbuf->used;
+			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
+						      snod->len);
+			if (err)
+				goto out;
+			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+						snod->offs, new_lnum, new_offs,
+						snod->len);
+			if (err)
+				goto out;
+
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			list_del(&snod->list);
+			kfree(snod);
+		}
+
+		if (list_empty(&large))
+			break;
+
+		/*
+		 * Waste the rest of the space in the LEB and switch to the
+		 * next LEB.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			goto out;
+	}
+
+	return 0;
+
+out_join:
+	/*
+	 * The failure happened while the nodes were still being classified.
+	 * Nodes already moved to 'medium' and 'small' are no longer on
+	 * @sleb->nodes, so join them with 'large' to have them freed below
+	 * instead of leaking them.
+	 */
+	list_splice(&medium, large.prev);
+	list_splice(&small, large.prev);
+out:
+	list_for_each_entry_safe(snod, tmp, &large, list) {
+		list_del(&snod->list);
+		kfree(snod);
+	}
+	return err;
+}
+
+/**
+ * gc_sync_wbufs - sync write-buffers for GC.
+ * @c: UBIFS file-system description object
+ *
+ * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
+ * be in a write-buffer instead. That is, a node could be written to a
+ * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
+ * erased before the write-buffer is sync'd and then there is an unclean
+ * unmount, then an existing node is lost. To avoid this, we sync the
+ * write-buffer of every journal head except the GC head.
+ *
+ * This function returns %0 on success or a negative error code on failure.
+ */
+static int gc_sync_wbufs(struct ubifs_info *c)
+{
+	int jhead;
+
+	for (jhead = 0; jhead < c->jhead_cnt; jhead++) {
+		int err;
+
+		/* The GC head's own write-buffer is left alone here */
+		if (jhead != GCHD) {
+			err = ubifs_wbuf_sync(&c->jheads[jhead].wbuf);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lp: describes the LEB to garbage collect
+ *
+ * The LEB is scanned and the type of its first node decides how it is
+ * handled. For an index LEB every index node is marked dirty, the LEB is
+ * queued on @c->idx_gc and %LEB_FREED_IDX is returned. For any other LEB the
+ * nodes are moved with move_nodes() and the write-buffers are synchronized;
+ * the LEB then either becomes the reserved GC LEB (%LEB_RETAINED) if
+ * @c->gc_lnum was unset, or is unmapped (%LEB_FREED).
+ *
+ * This function garbage-collects an LEB and returns one of the @LEB_FREED,
+ * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of failures.
+ */
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+ int err = 0, lnum = lp->lnum;
+
+ ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
+ c->need_recovery);
+ /* The LEB to collect must be neither the reserved LEB nor the GC head */
+ ubifs_assert(c->gc_lnum != lnum);
+ ubifs_assert(wbuf->lnum != lnum);
+
+ /*
+ * We scan the entire LEB even though we only really need to scan up to
+ * (c->leb_size - lp->free).
+ */
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+
+ ubifs_assert(!list_empty(&sleb->nodes));
+ /* The type of the first node decides how the whole LEB is treated */
+ snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+
+ if (snod->type == UBIFS_IDX_NODE) {
+ struct ubifs_gced_idx_leb *idx_gc;
+
+ dbg_gc("indexing LEB %d (free %d, dirty %d)",
+ lnum, lp->free, lp->dirty);
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ struct ubifs_idx_node *idx = snod->node;
+ int level = le16_to_cpu(idx->level);
+
+ ubifs_assert(snod->type == UBIFS_IDX_NODE);
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
+ snod->offs);
+ if (err)
+ goto out;
+ }
+
+ idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+ if (!idx_gc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Queue the LEB on the list of GC'ed index LEBs */
+ idx_gc->lnum = lnum;
+ idx_gc->unmap = 0;
+ list_add(&idx_gc->list, &c->idx_gc);
+
+ /*
+ * Don't release the LEB until after the next commit, because
+ * it may contain data which is needed for recovery. So
+ * although we freed this LEB, it will become usable only after
+ * the commit.
+ */
+ err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
+ LPROPS_INDEX, 1);
+ if (err)
+ goto out;
+ err = LEB_FREED_IDX;
+ } else {
+ dbg_gc("data LEB %d (free %d, dirty %d)",
+ lnum, lp->free, lp->dirty);
+
+ err = move_nodes(c, sleb);
+ if (err)
+ goto out;
+
+ err = gc_sync_wbufs(c);
+ if (err)
+ goto out;
+
+ err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
+ if (err)
+ goto out;
+
+ if (c->gc_lnum == -1) {
+ /* No reserved GC LEB at the moment, keep this one for that */
+ c->gc_lnum = lnum;
+ err = LEB_RETAINED;
+ } else {
+ /* Sync the GC head's write-buffer before unmapping the LEB */
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ goto out;
+
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ goto out;
+
+ err = LEB_FREED;
+ }
+ }
+
+out:
+ /* Common exit: @err holds either a positive LEB_* code or an error */
+ ubifs_scan_destroy(sleb);
+ return err;
+}
+
+/**
+ * ubifs_garbage_collect - UBIFS garbage collector.
+ * @c: UBIFS file-system description object
+ * @anyway: do GC even if there are free LEBs
+ *
+ * This function does out-of-place garbage collection. The return codes are:
+ * o positive LEB number if the LEB has been freed and may be used;
+ * o %-EAGAIN if the caller has to run commit;
+ * o %-ENOSPC if GC failed to make any progress;
+ * o other negative error codes in case of other errors.
+ *
+ * Garbage collector writes data to the journal when GC'ing data LEBs, and just
+ * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
+ * commit may be required. But commit cannot be run from inside GC, because the
+ * caller might be holding the commit lock, so %-EAGAIN is returned instead;
+ * And this error code means that the caller has to run commit, and re-run GC
+ * if there is still no free space.
+ *
+ * There are many reasons why this function may return %-EAGAIN:
+ * o the log is full and there is no space to write an LEB reference for
+ * @c->gc_lnum;
+ * o the journal is too large and exceeds size limitations;
+ * o GC moved indexing LEBs, but they can be used only after the commit;
+ * o the shrinker fails to find clean znodes to free and requests the commit;
+ * o etc.
+ *
+ * Note, if the file-system is close to be full, this function may return
+ * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
+ * the function. E.g., this happens if the limits on the journal size are too
+ * tough and GC writes too much to the journal before an LEB is freed. This
+ * might also mean that the journal is too large, and the TNC becomes to big,
+ * so that the shrinker is constantly called, finds not clean znodes to free,
+ * and requests commit. Well, this may also happen if the journal is all right,
+ * but another kernel process consumes too much memory. Anyway, infinite
+ * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
+ */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
+{
+	int i, err, ret, min_space = c->dead_wm;
+	struct ubifs_lprops lp;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	ubifs_assert_cmt_locked(c);
+
+	if (ubifs_gc_should_commit(c))
+		return -EAGAIN;
+
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+	if (c->ro_media) {
+		ret = -EROFS;
+		goto out_unlock;
+	}
+
+	/* We expect the write-buffer to be empty on entry */
+	ubifs_assert(!wbuf->used);
+
+	for (i = 0; ; i++) {
+		int space_before, space_after;
+
+		/*
+		 * No LEB has been picked by this iteration yet. The loop may
+		 * 'continue' after an LEB was found and then 'break' before
+		 * the next one is found (or break on the very first iteration
+		 * before anything was found), so reset the LEB number to make
+		 * sure the error path at 'out' never hands a stale or
+		 * uninitialized LEB number to 'ubifs_return_leb()'.
+		 */
+		lp.lnum = -1;
+
+		cond_resched();
+
+		/* Give the commit an opportunity to run */
+		if (ubifs_gc_should_commit(c)) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
+			/*
+			 * We've done enough iterations. Indexing LEBs were
+			 * moved and will be available after the commit.
+			 */
+			dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
+			ubifs_commit_required(c);
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (i > HARD_LEBS_LIMIT) {
+			/*
+			 * We've moved too many LEBs and have not made
+			 * progress, give up.
+			 */
+			dbg_gc("hard limit, -ENOSPC");
+			ret = -ENOSPC;
+			break;
+		}
+
+		/*
+		 * Empty and freeable LEBs can turn up while we waited for
+		 * the wbuf lock, or while we have been running GC. In that
+		 * case, we should just return one of those instead of
+		 * continuing to GC dirty LEBs. Hence we request
+		 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
+		 */
+		ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
+		if (ret) {
+			if (ret == -ENOSPC)
+				dbg_gc("no more dirty LEBs");
+			break;
+		}
+
+		dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
+		       "(min. space %d)", lp.lnum, lp.free, lp.dirty,
+		       lp.free + lp.dirty, min_space);
+
+		if (lp.free + lp.dirty == c->leb_size) {
+			/* An empty LEB was returned */
+			dbg_gc("LEB %d is free, return it", lp.lnum);
+			/*
+			 * ubifs_find_dirty_leb() doesn't return freeable index
+			 * LEBs.
+			 */
+			ubifs_assert(!(lp.flags & LPROPS_INDEX));
+			if (lp.free != c->leb_size) {
+				/*
+				 * Write buffers must be sync'd before
+				 * unmapping freeable LEBs, because one of them
+				 * may contain data which obsoletes something
+				 * in 'lp.lnum'.
+				 */
+				ret = gc_sync_wbufs(c);
+				if (ret)
+					goto out;
+				ret = ubifs_change_one_lp(c, lp.lnum,
+							  c->leb_size, 0, 0, 0,
+							  0);
+				if (ret)
+					goto out;
+			}
+			ret = ubifs_leb_unmap(c, lp.lnum);
+			if (ret)
+				goto out;
+			ret = lp.lnum;
+			break;
+		}
+
+		/*
+		 * Remember how much room the GC head had before moving nodes,
+		 * so that progress can be measured afterwards. A GC head that
+		 * has not been seeked yet counts as having no room.
+		 */
+		space_before = c->leb_size - wbuf->offs - wbuf->used;
+		if (wbuf->lnum == -1)
+			space_before = 0;
+
+		ret = ubifs_garbage_collect_leb(c, &lp);
+		if (ret < 0) {
+			if (ret == -EAGAIN || ret == -ENOSPC) {
+				/*
+				 * These codes are not errors, so we have to
+				 * return the LEB to lprops. But if the
+				 * 'ubifs_return_leb()' function fails, its
+				 * failure code is propagated to the caller
+				 * instead of the original '-EAGAIN' or
+				 * '-ENOSPC'.
+				 */
+				err = ubifs_return_leb(c, lp.lnum);
+				if (err)
+					ret = err;
+				break;
+			}
+			goto out;
+		}
+
+		if (ret == LEB_FREED) {
+			/* An LEB has been freed and is ready for use */
+			dbg_gc("LEB %d freed, return", lp.lnum);
+			ret = lp.lnum;
+			break;
+		}
+
+		if (ret == LEB_FREED_IDX) {
+			/*
+			 * This was an indexing LEB and it cannot be
+			 * immediately used. And instead of requesting the
+			 * commit straight away, we try to garbage collect some
+			 * more.
+			 */
+			dbg_gc("indexing LEB %d freed, continue", lp.lnum);
+			continue;
+		}
+
+		ubifs_assert(ret == LEB_RETAINED);
+		space_after = c->leb_size - wbuf->offs - wbuf->used;
+		dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
+		       space_after - space_before);
+
+		if (space_after > space_before) {
+			/* GC makes progress, keep working */
+			min_space >>= 1;
+			if (min_space < c->dead_wm)
+				min_space = c->dead_wm;
+			continue;
+		}
+
+		dbg_gc("did not make progress");
+
+		/*
+		 * GC moved an LEB but has not made any progress. This means
+		 * that the previous GC head LEB contained too little free
+		 * space and the LEB which was GC'ed contained only large nodes
+		 * which did not fit that space.
+		 *
+		 * We can do 2 things:
+		 * 1. pick another LEB in a hope it'll contain a small node
+		 *    which will fit the space we have at the end of current GC
+		 *    head LEB, but there is no guarantee, so we try this out
+		 *    unless we have already been working for too long;
+		 * 2. request an LEB with more dirty space, which will force
+		 *    'ubifs_find_dirty_leb()' to start scanning the lprops
+		 *    table, instead of just picking one from the heap
+		 *    (previously it already picked the dirtiest LEB).
+		 */
+		if (i < SOFT_LEBS_LIMIT) {
+			dbg_gc("try again");
+			continue;
+		}
+
+		min_space <<= 1;
+		if (min_space > c->dark_wm)
+			min_space = c->dark_wm;
+		dbg_gc("set min. space to %d", min_space);
+	}
+
+	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
+		dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
+		ubifs_commit_required(c);
+		ret = -EAGAIN;
+	}
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (!err)
+		err = ubifs_leb_unmap(c, c->gc_lnum);
+	if (err) {
+		ret = err;
+		goto out;
+	}
+out_unlock:
+	mutex_unlock(&wbuf->io_mutex);
+	return ret;
+
+out:
+	ubifs_assert(ret < 0);
+	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
+	ubifs_ro_mode(c, ret);
+	ubifs_wbuf_sync_nolock(wbuf);
+	mutex_unlock(&wbuf->io_mutex);
+	/* Only give an LEB back if the last iteration actually picked one */
+	if (lp.lnum != -1)
+		ubifs_return_leb(c, lp.lnum);
+	return ret;
+}
+
+/**
+ * ubifs_gc_start_commit - garbage collection at start of commit.
+ * @c: UBIFS file-system description object
+ *
+ * If a LEB has only dirty and free space, then we may safely unmap it and make
+ * it free. Note, we cannot do this with indexing LEBs because dirty space may
+ * correspond to index nodes that are required for recovery. In that case, the
+ * LEB cannot be unmapped until after the next commit.
+ *
+ * This function returns %0 upon success and a negative error code upon failure.
+ */
+int ubifs_gc_start_commit(struct ubifs_info *c)
+{
+ struct ubifs_gced_idx_leb *idx_gc;
+ const struct ubifs_lprops *lp;
+ int err = 0, flags;
+
+ /* Paired with 'ubifs_release_lprops()' at the 'out' label */
+ ubifs_get_lprops(c);
+
+ /*
+ * Unmap (non-index) freeable LEBs. Note that recovery requires that all
+ * wbufs are sync'd before this, which is done in 'do_commit()'.
+ */
+ while (1) {
+ lp = ubifs_fast_find_freeable(c);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ /* NULL means there are no more freeable LEBs */
+ if (!lp)
+ break;
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ err = ubifs_leb_unmap(c, lp->lnum);
+ if (err)
+ goto out;
+ /*
+ * Update lprops for the unmapped LEB, keeping its flags. Presumably
+ * the arguments mean free = c->leb_size and dirty = 0 - confirm
+ * against 'ubifs_change_lp()'.
+ */
+ lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ }
+
+ /* Mark GC'd index LEBs OK to unmap after this commit finishes */
+ list_for_each_entry(idx_gc, &c->idx_gc, list)
+ idx_gc->unmap = 1;
+
+ /* Record index freeable LEBs for unmapping after commit */
+ while (1) {
+ lp = ubifs_fast_find_frdi_idx(c);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ if (!lp)
+ break;
+ /*
+ * Allocate the list entry before touching lprops, so that an
+ * allocation failure leaves the LEB properties unchanged.
+ */
+ idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+ if (!idx_gc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(lp->flags & LPROPS_INDEX);
+ /* Don't release the LEB until after the next commit */
+ /*
+ * LPROPS_INDEX is expected to be set here (see the assertion above),
+ * so the XOR clears it while LPROPS_TAKEN gets set.
+ */
+ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
+ lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ /* The entry has not been added to the list yet, free it here */
+ kfree(idx_gc);
+ goto out;
+ }
+ ubifs_assert(lp->flags & LPROPS_TAKEN);
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ idx_gc->lnum = lp->lnum;
+ idx_gc->unmap = 1;
+ list_add(&idx_gc->list, &c->idx_gc);
+ }
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_gc_end_commit - garbage collection at end of commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function completes out-of-place garbage collection of index LEBs. It
+ * walks the 'c->idx_gc' list with the GC head write-buffer mutex held and, for
+ * every entry which is marked for unmapping, unmaps the LEB, updates its
+ * properties with 'ubifs_change_one_lp()' and removes the entry from the list.
+ * Returns zero in case of success and the first error code encountered in case
+ * of failure (the remaining entries are then left on the list).
+ */
+int ubifs_gc_end_commit(struct ubifs_info *c)
+{
+	struct ubifs_wbuf *gc_wbuf = &c->jheads[GCHD].wbuf;
+	struct ubifs_gced_idx_leb *entry, *next;
+	int err = 0;
+
+	mutex_lock_nested(&gc_wbuf->io_mutex, gc_wbuf->jhead);
+
+	list_for_each_entry_safe(entry, next, &c->idx_gc, list) {
+		/* Entries not yet marked for unmapping stay on the list */
+		if (!entry->unmap)
+			continue;
+
+		dbg_gc("LEB %d", entry->lnum);
+
+		err = ubifs_leb_unmap(c, entry->lnum);
+		if (err)
+			break;
+
+		err = ubifs_change_one_lp(c, entry->lnum, LPROPS_NC,
+					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
+		if (err)
+			break;
+
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	mutex_unlock(&gc_wbuf->io_mutex);
+	return err;
+}
+
+/**
+ * ubifs_destroy_idx_gc - destroy idx_gc list.
+ * @c: UBIFS file-system description object
+ *
+ * This function empties the 'c->idx_gc' list: every entry is unlinked and
+ * freed, and 'c->idx_gc_cnt' is decremented once per entry. It is called when
+ * unmounting or remounting read-only so locks are not needed.
+ */
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &c->idx_gc, list) {
+		c->idx_gc_cnt -= 1;
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/**
+ * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
+ * @c: UBIFS file-system description object
+ *
+ * This function removes the first entry of the 'c->idx_gc' list, frees it and
+ * returns the LEB number it carried. If the list is empty, %-ENOSPC is
+ * returned. Called during start commit so locks are not needed.
+ */
+int ubifs_get_idx_gc_leb(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *first;
+	int leb;
+
+	if (!list_empty(&c->idx_gc)) {
+		first = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
+				   list);
+		leb = first->lnum;
+		/*
+		 * c->idx_gc_cnt is updated by the caller when lprops are
+		 * updated.
+		 */
+		list_del(&first->list);
+		kfree(first);
+		return leb;
+	}
+
+	return -ENOSPC;
+}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
new file mode 100644
index 00000000000..3374f91b670
--- /dev/null
+++ b/fs/ubifs/io.c
@@ -0,0 +1,914 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ * Zoltan Sogor
+ */
+
+/*
+ * This file implements UBIFS I/O subsystem which provides various I/O-related
+ * helper functions (reading/writing/checking/validating nodes) and implements
+ * write-buffering support. Write buffers help to save space which otherwise
+ * would have been wasted for padding to the nearest minimal I/O unit boundary.
+ * Instead, data first goes to the write-buffer and is flushed when the
+ * buffer is full or when it is not used for some time (by timer). This is
+ * similar to the mechanism used by JFFS2.
+ *
+ * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
+ * mutexes defined inside these objects. Since sometimes upper-level code
+ * has to lock the write-buffer (e.g. journal space reservation code), many
+ * functions related to write-buffers have "nolock" suffix which means that the
+ * caller has to lock the write-buffer before calling this function.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
+ * every time they are read from the flash media.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming crazy when an attacker
+ * feeds it a file-system image with incorrect nodes. For example, too large
+ * node length in the common header could cause UBIFS to read memory outside of
+ * allocated buffer when checking the CRC checksum.
+ *
+ * This function returns zero in case of success, %-EUCLEAN in case of bad CRC
+ * or magic, and %-EINVAL in case of bad node type or bad node length.
+ */
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+ int offs, int quiet)
+{
+ /* %-EINVAL is what the bad type and bad length paths return */
+ int err = -EINVAL, type, node_len;
+ uint32_t crc, node_crc, magic;
+ const struct ubifs_ch *ch = buf;
+
+ ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+
+ magic = le32_to_cpu(ch->magic);
+ if (magic != UBIFS_NODE_MAGIC) {
+ if (!quiet)
+ ubifs_err("bad magic %#08x, expected %#08x",
+ magic, UBIFS_NODE_MAGIC);
+ err = -EUCLEAN;
+ goto out;
+ }
+
+ /* The type indexes 'c->ranges[]' below, so it must be validated first */
+ type = ch->node_type;
+ if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
+ if (!quiet)
+ ubifs_err("bad node type %d", type);
+ goto out;
+ }
+
+ /* The node must not cross the end of the logical eraseblock */
+ node_len = le32_to_cpu(ch->len);
+ if (node_len + offs > c->leb_size)
+ goto out_len;
+
+ /*
+ * A zero 'max_len' means the length has to match 'len' exactly,
+ * otherwise the length has to be within [min_len, max_len].
+ */
+ if (c->ranges[type].max_len == 0) {
+ if (node_len != c->ranges[type].len)
+ goto out_len;
+ } else if (node_len < c->ranges[type].min_len ||
+ node_len > c->ranges[type].max_len)
+ goto out_len;
+
+ /*
+ * The first 8 bytes of the node are excluded from the CRC (presumably
+ * the magic and the CRC field itself - confirm against
+ * 'struct ubifs_ch').
+ */
+ crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+ node_crc = le32_to_cpu(ch->crc);
+ if (crc != node_crc) {
+ if (!quiet)
+ ubifs_err("bad CRC: calculated %#08x, read %#08x",
+ crc, node_crc);
+ err = -EUCLEAN;
+ goto out;
+ }
+
+ return 0;
+
+out_len:
+ if (!quiet)
+ ubifs_err("bad node length %d", node_len);
+out:
+ if (!quiet) {
+ ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+/**
+ * ubifs_pad - pad flash space.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to put padding to
+ * @pad: how many bytes to pad
+ *
+ * The flash media obliges us to write only in chunks of %c->min_io_size, so
+ * when there is less data to write, the rest of the min. I/O unit is filled
+ * with padding. If @pad is at least %UBIFS_PAD_NODE_SZ bytes, a padding node
+ * is put at @buf and the bytes which follow it are zeroed; padding nodes help
+ * when the media is being scanned. If the gap is too small to fit a padding
+ * node, it is filled with the %UBIFS_PADDING_BYTE pattern instead. Nothing is
+ * written if @pad is zero.
+ *
+ * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
+ * used.
+ */
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
+{
+	struct ubifs_pad_node *pnode = buf;
+	struct ubifs_ch *hdr = buf;
+	uint32_t csum;
+	int tail;
+
+	ubifs_assert(pad >= 0 && !(pad & 7));
+
+	if (pad < UBIFS_PAD_NODE_SZ) {
+		/* Too little space, padding node won't fit */
+		if (pad > 0)
+			memset(buf, UBIFS_PADDING_BYTE, pad);
+		return;
+	}
+
+	/* Number of bytes which remain after the padding node itself */
+	tail = pad - UBIFS_PAD_NODE_SZ;
+
+	hdr->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	hdr->node_type = UBIFS_PAD_NODE;
+	hdr->group_type = UBIFS_NO_NODE_GROUP;
+	hdr->padding[0] = hdr->padding[1] = 0;
+	hdr->sqnum = 0;
+	hdr->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
+	pnode->pad_len = cpu_to_le32(tail);
+
+	/* The CRC is calculated last, when the whole node has been filled */
+	csum = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
+	hdr->crc = cpu_to_le32(csum);
+
+	memset(buf + UBIFS_PAD_NODE_SZ, 0, tail);
+}
+
+/**
+ * next_sqnum - get next sequence number.
+ * @c: UBIFS file-system description object
+ *
+ * This function increments 'c->max_sqnum' under 'c->cnt_lock' and returns the
+ * new value. A warning is printed once the value reaches
+ * %SQNUM_WARN_WATERMARK; once it reaches %SQNUM_WATERMARK an error is printed
+ * as well and UBIFS is switched to read-only mode.
+ */
+static unsigned long long next_sqnum(struct ubifs_info *c)
+{
+	unsigned long long seq;
+
+	spin_lock(&c->cnt_lock);
+	c->max_sqnum += 1;
+	seq = c->max_sqnum;
+	spin_unlock(&c->cnt_lock);
+
+	/* The common case: plenty of sequence numbers are left */
+	if (likely(seq < SQNUM_WARN_WATERMARK))
+		return seq;
+
+	if (seq >= SQNUM_WATERMARK) {
+		ubifs_err("sequence number overflow %llu, end of life",
+			  seq);
+		ubifs_ro_mode(c, -EINVAL);
+	}
+	ubifs_warn("running out of sequence numbers, end of life soon");
+
+	return seq;
+}
+
+/**
+ * ubifs_prepare_node - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to prepare
+ * @len: node length
+ * @pad: if the buffer has to be padded
+ *
+ * This function makes the node at @node ready to be written to the media: it
+ * fills the common header (magic, length, group type, a fresh sequence
+ * number), calculates the node CRC and, if @pad is not zero, adds padding
+ * after the 8-byte aligned end of the node up to the next minimum I/O unit
+ * boundary.
+ */
+void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
+{
+	struct ubifs_ch *hdr = node;
+	unsigned long long seq = next_sqnum(c);
+	uint32_t csum;
+	int aligned;
+
+	ubifs_assert(len >= UBIFS_CH_SZ);
+
+	hdr->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	hdr->len = cpu_to_le32(len);
+	hdr->group_type = UBIFS_NO_NODE_GROUP;
+	hdr->sqnum = cpu_to_le64(seq);
+	hdr->padding[0] = hdr->padding[1] = 0;
+
+	/* The CRC goes last because it covers the header fields set above */
+	csum = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+	hdr->crc = cpu_to_le32(csum);
+
+	if (!pad)
+		return;
+
+	aligned = ALIGN(len, 8);
+	ubifs_pad(c, node + aligned,
+		  ALIGN(aligned, c->min_io_size) - aligned);
+}
+
+/**
+ * ubifs_prep_grp_node - prepare node of a group to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to prepare
+ * @len: node length
+ * @last: indicates the last node of the group
+ *
+ * This function makes the node at @node ready to be written to the media: it
+ * fills the common header, marking the node as either a member or the last
+ * member of a node group depending on @last, and calculates the node CRC. No
+ * padding is added.
+ */
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
+{
+	struct ubifs_ch *hdr = node;
+	unsigned long long seq = next_sqnum(c);
+	uint32_t csum;
+
+	ubifs_assert(len >= UBIFS_CH_SZ);
+
+	hdr->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+	hdr->len = cpu_to_le32(len);
+	hdr->group_type = last ? UBIFS_LAST_OF_NODE_GROUP :
+				 UBIFS_IN_NODE_GROUP;
+	hdr->sqnum = cpu_to_le64(seq);
+	hdr->padding[0] = hdr->padding[1] = 0;
+
+	/* The CRC goes last because it covers the header fields set above */
+	csum = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+	hdr->crc = cpu_to_le32(csum);
+}
+
+/**
+ * wbuf_timer_callback_nolock - write-buffer timer callback function.
+ * @data: timer data (write-buffer descriptor)
+ *
+ * This function is called when the write-buffer timer expires. It flags the
+ * write-buffer and the file-system as needing write-buffer synchronization
+ * and wakes up the background thread.
+ */
+static void wbuf_timer_callback_nolock(unsigned long data)
+{
+	struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+	struct ubifs_info *c = wbuf->c;
+
+	wbuf->need_sync = 1;
+	c->need_wbuf_sync = 1;
+	ubifs_wake_up_bgt(c);
+}
+
+/**
+ * new_wbuf_timer_nolock - start new write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ *
+ * This function arms the write-buffer timer to expire 'wbuf->timeout' jiffies
+ * from now. Nothing is done if the timeout is zero. The timer must not be
+ * pending when this function is called.
+ */
+static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+	ubifs_assert(!timer_pending(&wbuf->timer));
+
+	if (wbuf->timeout) {
+		wbuf->timer.expires = jiffies + wbuf->timeout;
+		add_timer(&wbuf->timer);
+	}
+}
+
+/**
+ * cancel_wbuf_timer_nolock - cancel write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ *
+ * This function withdraws a pending synchronization request for @wbuf and
+ * deletes the write-buffer timer.
+ */
+static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+	/*
+	 * The background thread may already be waiting for the write-buffer
+	 * lock in order to synchronize it. If another task is changing the
+	 * write-buffer in the meantime, that synchronization has to be
+	 * canceled, hence the flag is dropped before the timer is deleted.
+	 */
+	wbuf->need_sync = 0;
+
+	del_timer(&wbuf->timer);
+}
+
+/**
+ * ubifs_wbuf_sync_nolock - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This function synchronizes write-buffer @wbuf and returns zero in case of
+ * success or a negative error code in case of failure. The unused part of the
+ * buffer is padded and one whole minimal I/O unit is written to the flash.
+ * Nothing is written if the write-buffer is empty or has not been seeked yet;
+ * %-EROFS is returned if the media is read-only. The write-buffer timer is
+ * canceled in all cases.
+ */
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
+{
+ struct ubifs_info *c = wbuf->c;
+ int err, dirt;
+
+ cancel_wbuf_timer_nolock(wbuf);
+ if (!wbuf->used || wbuf->lnum == -1)
+ /* Write-buffer is empty or not seeked */
+ return 0;
+
+ dbg_io("LEB %d:%d, %d bytes",
+ wbuf->lnum, wbuf->offs, wbuf->used);
+ ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+ ubifs_assert(!(wbuf->avail & 7));
+ ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+
+ if (c->ro_media)
+ return -EROFS;
+
+ /* Fill the unused tail of the buffer, then write the whole I/O unit */
+ ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+ c->min_io_size, wbuf->dtype);
+ if (err) {
+ ubifs_err("cannot write %d bytes to LEB %d:%d",
+ c->min_io_size, wbuf->lnum, wbuf->offs);
+ dbg_dump_stack();
+ return err;
+ }
+
+ /* The padding which was just written is reported as dirty space */
+ dirt = wbuf->avail;
+
+ /* Advance to the next min. I/O unit and mark the buffer as empty */
+ spin_lock(&wbuf->lock);
+ wbuf->offs += c->min_io_size;
+ wbuf->avail = c->min_io_size;
+ wbuf->used = 0;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+
+ /*
+ * 'err' is zero at this point, so zero is returned if no callback is
+ * set, otherwise the result of the callback is returned.
+ */
+ if (wbuf->sync_callback)
+ err = wbuf->sync_callback(c, wbuf->lnum,
+ c->leb_size - wbuf->offs, dirt);
+ return err;
+}
+
+/**
+ * ubifs_wbuf_seek_nolock - seek write-buffer.
+ * @wbuf: write-buffer
+ * @lnum: logical eraseblock number to seek to
+ * @offs: logical eraseblock offset to seek to
+ * @dtype: data type
+ *
+ * This function points the write-buffer to logical eraseblock @lnum:@offs and
+ * records @dtype for subsequent writes. If the write-buffer contains data, it
+ * is synchronized first. Returns zero in case of success and a negative error
+ * code in case of failure, in which case the write-buffer position is left
+ * unchanged.
+ */
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
+			   int dtype)
+{
+	const struct ubifs_info *c = wbuf->c;
+	int err;
+
+	dbg_io("LEB %d:%d", lnum, offs);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
+	ubifs_assert(offs >= 0 && offs <= c->leb_size);
+	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
+	ubifs_assert(lnum != wbuf->lnum);
+
+	/* Do not lose what is still sitting in the buffer */
+	if (wbuf->used > 0) {
+		err = ubifs_wbuf_sync_nolock(wbuf);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&wbuf->lock);
+	wbuf->lnum = lnum;
+	wbuf->offs = offs;
+	wbuf->avail = c->min_io_size;
+	wbuf->used = 0;
+	spin_unlock(&wbuf->lock);
+
+	wbuf->dtype = dtype;
+	return 0;
+}
+
+/**
+ * ubifs_bg_wbufs_sync - synchronize write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by background thread to synchronize write-buffers.
+ * Only write-buffers which have their 'need_sync' flag set are synchronized.
+ * Returns zero in case of success and a negative error code in case of
+ * failure. In case of failure the timers of all write-buffers are canceled.
+ */
+int ubifs_bg_wbufs_sync(struct ubifs_info *c)
+{
+ int err, i;
+
+ /* Nothing to do unless a write-buffer timer has requested a sync */
+ if (!c->need_wbuf_sync)
+ return 0;
+ c->need_wbuf_sync = 0;
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_timers;
+ }
+
+ dbg_io("synchronize");
+ for (i = 0; i < c->jhead_cnt; i++) {
+ struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+ cond_resched();
+
+ /*
+ * If the mutex is locked then wbuf is being changed, so
+ * synchronization is not necessary.
+ */
+ if (mutex_is_locked(&wbuf->io_mutex))
+ continue;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ /*
+ * Re-check under the mutex: 'cancel_wbuf_timer_nolock()' clears
+ * 'need_sync' when the write-buffer is changed by someone else.
+ */
+ if (!wbuf->need_sync) {
+ mutex_unlock(&wbuf->io_mutex);
+ continue;
+ }
+
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ if (err) {
+ ubifs_err("cannot sync write-buffer, error %d", err);
+ ubifs_ro_mode(c, err);
+ goto out_timers;
+ }
+ }
+
+ return 0;
+
+out_timers:
+ /* Cancel all timers to prevent repeated errors */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ cancel_wbuf_timer_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ }
+ return err;
+}
+
+/**
+ * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
+ * @wbuf: write-buffer
+ * @buf: node to write
+ * @len: node length
+ *
+ * This function writes data to flash via write-buffer @wbuf. This means that
+ * the last piece of the node won't reach the flash media immediately if it
+ * does not take whole minimal I/O unit. Instead, the node will sit in RAM
+ * until the write-buffer is synchronized (e.g., by timer).
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure. If the node cannot be written because there is no more
+ * space in this logical eraseblock, %-ENOSPC is returned. If the media is
+ * read-only, %-EROFS is returned.
+ */
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
+{
+ struct ubifs_info *c = wbuf->c;
+ int err, written, n, aligned_len = ALIGN(len, 8), offs;
+
+ dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
+ dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
+ wbuf->offs + wbuf->used);
+ ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
+ ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
+ ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
+ ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
+ ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+
+ /* Refuse the node if the rest of the LEB cannot hold it */
+ if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ cancel_wbuf_timer_nolock(wbuf);
+
+ /* Note, this return bypasses the diagnostics at the 'out' label */
+ if (c->ro_media)
+ return -EROFS;
+
+ if (aligned_len <= wbuf->avail) {
+ /*
+ * The node is not very large and fits entirely within
+ * write-buffer.
+ */
+ memcpy(wbuf->buf + wbuf->used, buf, len);
+
+ if (aligned_len == wbuf->avail) {
+ /* The buffer became exactly full, write it out right away */
+ dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+ wbuf->offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
+ wbuf->offs, c->min_io_size,
+ wbuf->dtype);
+ if (err)
+ goto out;
+
+ spin_lock(&wbuf->lock);
+ wbuf->offs += c->min_io_size;
+ wbuf->avail = c->min_io_size;
+ wbuf->used = 0;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+ } else {
+ spin_lock(&wbuf->lock);
+ wbuf->avail -= aligned_len;
+ wbuf->used += aligned_len;
+ spin_unlock(&wbuf->lock);
+ }
+
+ goto exit;
+ }
+
+ /*
+ * The node is large enough and does not fit entirely within current
+ * minimal I/O unit. We have to fill and flush write-buffer and switch
+ * to the next min. I/O unit.
+ */
+ dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
+ memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+ c->min_io_size, wbuf->dtype);
+ if (err)
+ goto out;
+
+ /*
+ * 'offs' is the next flash offset to write to and 'written' is how
+ * many bytes of @buf have been consumed so far.
+ */
+ offs = wbuf->offs + c->min_io_size;
+ len -= wbuf->avail;
+ aligned_len -= wbuf->avail;
+ written = wbuf->avail;
+
+ /*
+ * The remaining data may take more whole min. I/O units, so write the
+ * remains multiple to min. I/O unit size directly to the flash media.
+ * We align node length to 8-byte boundary because we anyway flush wbuf
+ * if the remaining space is less than 8 bytes.
+ */
+ n = aligned_len >> c->min_io_shift;
+ if (n) {
+ /* Turn the count of whole min. I/O units into a byte count */
+ n <<= c->min_io_shift;
+ dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
+ wbuf->dtype);
+ if (err)
+ goto out;
+ offs += n;
+ aligned_len -= n;
+ len -= n;
+ written += n;
+ }
+
+ spin_lock(&wbuf->lock);
+ if (aligned_len)
+ /*
+ * And now we have what's left and what does not take whole
+ * min. I/O unit, so write it to the write-buffer and we are
+ * done.
+ */
+ memcpy(wbuf->buf, buf + written, len);
+
+ wbuf->offs = offs;
+ wbuf->used = aligned_len;
+ wbuf->avail = c->min_io_size - aligned_len;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+
+exit:
+ /* Report how much free space is left in the LEB after this write */
+ if (wbuf->sync_callback) {
+ int free = c->leb_size - wbuf->offs - wbuf->used;
+
+ err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
+ if (err)
+ goto out;
+ }
+
+ /* Unwritten data is left in the buffer, make sure it is flushed later */
+ if (wbuf->used)
+ new_wbuf_timer_nolock(wbuf);
+
+ return 0;
+
+out:
+ /*
+ * NOTE(review): 'len' may have been reduced above, so the byte count
+ * printed here can be smaller than what the caller asked to write.
+ */
+ ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+ len, wbuf->lnum, wbuf->offs, err);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ dbg_dump_leb(c, wbuf->lnum);
+ return err;
+}
+
+/**
+ * ubifs_write_node - write node to the media.
+ * @c: UBIFS file-system description object
+ * @buf: the node to write
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
+ *
+ * This function prepares the node (magic number, sequence number, CRC
+ * checksum, padding up to the minimal I/O unit) and writes it directly to the
+ * flash, bypassing write-buffers. The @buf buffer has to be large enough to
+ * hold @len aligned to the minimal I/O unit size, because that many bytes are
+ * written. Returns zero in case of success and a negative error code in case
+ * of failure (%-EROFS if the media is read-only).
+ */
+int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
+		     int offs, int dtype)
+{
+	int wr_len = ALIGN(len, c->min_io_size);
+	int err;
+
+	dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
+	       lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
+	       wr_len);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+
+	if (c->ro_media)
+		return -EROFS;
+
+	ubifs_prepare_node(c, buf, len, 1);
+
+	err = ubi_leb_write(c->ubi, lnum, buf, offs, wr_len, dtype);
+	if (!err)
+		return 0;
+
+	ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+		  wr_len, lnum, offs, err);
+	dbg_dump_node(c, buf);
+	dbg_dump_stack();
+	return err;
+}
+
+/**
+ * ubifs_read_node_wbuf - read node from the media or write-buffer.
+ * @wbuf: wbuf to check for un-written data
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and length, checks it and stores
+ * in @buf. If the node partially or fully sits in the write-buffer, this
+ * function takes data from the buffer, otherwise it reads the flash media.
+ * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
+ * error code in case of failure (%-EINVAL if the node type or length is not
+ * what was expected).
+ */
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+ int lnum, int offs)
+{
+ const struct ubifs_info *c = wbuf->c;
+ int err, rlen, overlap;
+ struct ubifs_ch *ch = buf;
+
+ dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+ /*
+ * NOTE(review): @wbuf has already been dereferenced above to get 'c',
+ * so the NULL check in this assertion comes too late to help.
+ */
+ ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+ ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+ spin_lock(&wbuf->lock);
+ /* Does the node reach into the area covered by the write-buffer? */
+ overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+ if (!overlap) {
+ /* We may safely unlock the write-buffer and read the data */
+ spin_unlock(&wbuf->lock);
+ return ubifs_read_node(c, buf, type, len, lnum, offs);
+ }
+
+ /* Don't read under wbuf */
+ /* 'rlen' is how many bytes of the node lie before the write-buffer */
+ rlen = wbuf->offs - offs;
+ if (rlen < 0)
+ rlen = 0;
+
+ /* Copy the rest from the write-buffer */
+ memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+ spin_unlock(&wbuf->lock);
+
+ if (rlen > 0) {
+ /* Read everything that goes before write-buffer */
+ err = ubi_read(c->ubi, lnum, buf, offs, rlen);
+ /*
+ * %-EBADMSG is not treated as fatal here, the node is validated
+ * by 'ubifs_check_node()' below.
+ */
+ if (err && err != -EBADMSG) {
+ ubifs_err("failed to read node %d from LEB %d:%d, "
+ "error %d", type, lnum, offs, err);
+ dbg_dump_stack();
+ return err;
+ }
+ }
+
+ if (type != ch->node_type) {
+ ubifs_err("bad node type (%d but expected %d)",
+ ch->node_type, type);
+ goto out;
+ }
+
+ err = ubifs_check_node(c, buf, lnum, offs, 0);
+ if (err) {
+ ubifs_err("expected node type %d", type);
+ return err;
+ }
+
+ /* 'rlen' is re-used to hold the length stored in the node header */
+ rlen = le32_to_cpu(ch->len);
+ if (rlen != len) {
+ ubifs_err("bad node length %d, expected %d", rlen, len);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_node - read node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and length from the flash media
+ * into @buf and validates it. Returns zero in case of success, %-EUCLEAN if
+ * CRC mismatched and a negative error code in case of failure (%-EINVAL if
+ * the node type or length is not what was expected).
+ */
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+		    int lnum, int offs)
+{
+	struct ubifs_ch *hdr = buf;
+	int err, node_len;
+
+	dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+	/*
+	 * %-EBADMSG is not treated as fatal here, the node is validated by
+	 * 'ubifs_check_node()' below.
+	 */
+	err = ubi_read(c->ubi, lnum, buf, offs, len);
+	if (err && err != -EBADMSG) {
+		ubifs_err("cannot read node %d from LEB %d:%d, error %d",
+			  type, lnum, offs, err);
+		return err;
+	}
+
+	if (hdr->node_type != type) {
+		ubifs_err("bad node type (%d but expected %d)",
+			  hdr->node_type, type);
+		goto out_bad;
+	}
+
+	err = ubifs_check_node(c, buf, lnum, offs, 0);
+	if (err) {
+		ubifs_err("expected node type %d", type);
+		return err;
+	}
+
+	node_len = le32_to_cpu(hdr->len);
+	if (node_len != len) {
+		ubifs_err("bad node length %d, expected %d", node_len, len);
+		goto out_bad;
+	}
+
+	return 0;
+
+out_bad:
+	ubifs_err("bad node at LEB %d:%d", lnum, offs);
+	dbg_dump_node(c, buf);
+	dbg_dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * ubifs_wbuf_init - initialize write-buffer.
+ * @c: UBIFS file-system description object
+ * @wbuf: write-buffer to initialize
+ *
+ * This function allocates the data buffer (one minimal I/O unit) and the inode
+ * number array of the write-buffer and puts the rest of @wbuf into its initial
+ * "empty, not seeked" state. Returns zero in case of success and %-ENOMEM in
+ * case of failure, in which case nothing is left allocated.
+ */
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
+{
+	size_t inodes_sz;
+
+	wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
+	if (!wbuf->buf)
+		return -ENOMEM;
+
+	inodes_sz = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
+	wbuf->inodes = kmalloc(inodes_sz, GFP_KERNEL);
+	if (!wbuf->inodes) {
+		kfree(wbuf->buf);
+		wbuf->buf = NULL;
+		return -ENOMEM;
+	}
+
+	/* Locks */
+	mutex_init(&wbuf->io_mutex);
+	spin_lock_init(&wbuf->lock);
+
+	/* The buffer is empty and does not point to any LEB yet */
+	wbuf->c = c;
+	wbuf->lnum = wbuf->offs = -1;
+	wbuf->used = 0;
+	wbuf->avail = c->min_io_size;
+	wbuf->next_ino = 0;
+	wbuf->dtype = UBI_UNKNOWN;
+	wbuf->sync_callback = NULL;
+
+	/* The timer is set up here but is not started */
+	init_timer(&wbuf->timer);
+	wbuf->timer.function = wbuf_timer_callback_nolock;
+	wbuf->timer.data = (unsigned long)wbuf;
+	wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
+
+	return 0;
+}
+
+/**
+ * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
+ * @wbuf: the write-buffer whereto add
+ * @inum: the inode number
+ *
+ * This function appends @inum to the inode array of the write-buffer. The
+ * number is recorded only if the write-buffer currently contains data; nothing
+ * is done for write-buffers which have no data buffer.
+ */
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+	/* NOR flash or something similar */
+	if (!wbuf->buf)
+		return;
+
+	spin_lock(&wbuf->lock);
+	if (wbuf->used) {
+		wbuf->inodes[wbuf->next_ino] = inum;
+		wbuf->next_ino += 1;
+	}
+	spin_unlock(&wbuf->lock);
+}
+
+/**
+ * wbuf_has_ino - returns if the wbuf contains data from the inode.
+ * @wbuf: the write-buffer
+ * @inum: the inode number
+ *
+ * This function searches the inode array of the write-buffer under the
+ * write-buffer spinlock. It returns %1 if @inum has been recorded there and
+ * %0 otherwise.
+ */
+static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+	int idx, found = 0;
+
+	spin_lock(&wbuf->lock);
+	for (idx = 0; !found && idx < wbuf->next_ino; idx++)
+		if (wbuf->inodes[idx] == inum)
+			found = 1;
+	spin_unlock(&wbuf->lock);
+
+	return found;
+}
+
+/**
+ * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to synchronize
+ *
+ * This function synchronizes every write-buffer, except the one of the GC
+ * head, which has the number of @inode recorded in its inode array. Returns
+ * zero in case of success and a negative error code in case of failure, in
+ * which case UBIFS is also switched to read-only mode.
+ */
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
+{
+	int i;
+
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf;
+		int err = 0;
+
+		/*
+		 * GC head is special, do not look at it. Even if the head
+		 * contains something related to this inode, it is a _copy_ of
+		 * corresponding on-flash node which sits somewhere else.
+		 */
+		if (i == GCHD)
+			continue;
+
+		wbuf = &c->jheads[i].wbuf;
+
+		/* Cheap check first, without taking the I/O mutex */
+		if (!wbuf_has_ino(wbuf, inode->i_ino))
+			continue;
+
+		/* Check again, now with the write-buffer locked */
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		if (wbuf_has_ino(wbuf, inode->i_ino))
+			err = ubifs_wbuf_sync_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+
+		if (err) {
+			ubifs_ro_mode(c, err);
+			return err;
+		}
+	}
+
+	return 0;
+}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
new file mode 100644
index 00000000000..5e82cffe969
--- /dev/null
+++ b/fs/ubifs/ioctl.c
@@ -0,0 +1,204 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Zoltan Sogor
+ * Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/* This file implements EXT2-compatible extended attribute ioctl() calls */
+
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
+#include <linux/mount.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_set_inode_flags - set VFS inode flags.
+ * @inode: VFS inode to set flags for
+ *
+ * This function mirrors the synchronous, append-only, immutable and
+ * directory-synchronous flags of the UBIFS inode object into the VFS inode
+ * object. Other VFS inode flags are left untouched.
+ */
+void ubifs_set_inode_flags(struct inode *inode)
+{
+	unsigned int ui_flags = ubifs_inode(inode)->flags;
+	unsigned int vfs_flags = 0;
+
+	/* Work out which VFS flags have to be set */
+	if (ui_flags & UBIFS_SYNC_FL)
+		vfs_flags |= S_SYNC;
+	if (ui_flags & UBIFS_APPEND_FL)
+		vfs_flags |= S_APPEND;
+	if (ui_flags & UBIFS_IMMUTABLE_FL)
+		vfs_flags |= S_IMMUTABLE;
+	if (ui_flags & UBIFS_DIRSYNC_FL)
+		vfs_flags |= S_DIRSYNC;
+
+	/* Drop the stale values first, then apply the fresh ones */
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
+	inode->i_flags |= vfs_flags;
+}
+
+/**
+ * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
+ * @ioctl_flags: flags to convert
+ *
+ * This function translates each supported ioctl flag (%FS_COMPR_FL,
+ * %FS_SYNC_FL, %FS_APPEND_FL, %FS_IMMUTABLE_FL and %FS_DIRSYNC_FL) to its
+ * UBIFS counterpart (%UBIFS_COMPR_FL, etc) and returns the result. All other
+ * bits of @ioctl_flags are ignored.
+ */
+static int ioctl2ubifs(int ioctl_flags)
+{
+	int ubifs_flags;
+
+	ubifs_flags  = (ioctl_flags & FS_COMPR_FL) ? UBIFS_COMPR_FL : 0;
+	ubifs_flags |= (ioctl_flags & FS_SYNC_FL) ? UBIFS_SYNC_FL : 0;
+	ubifs_flags |= (ioctl_flags & FS_APPEND_FL) ? UBIFS_APPEND_FL : 0;
+	ubifs_flags |= (ioctl_flags & FS_IMMUTABLE_FL) ? UBIFS_IMMUTABLE_FL : 0;
+	ubifs_flags |= (ioctl_flags & FS_DIRSYNC_FL) ? UBIFS_DIRSYNC_FL : 0;
+
+	return ubifs_flags;
+}
+
+/**
+ * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
+ * @ubifs_flags: flags to convert
+ *
+ * This function translates each supported UBIFS flag (%UBIFS_COMPR_FL,
+ * %UBIFS_SYNC_FL, %UBIFS_APPEND_FL, %UBIFS_IMMUTABLE_FL and
+ * %UBIFS_DIRSYNC_FL) to its ioctl counterpart (%FS_COMPR_FL, etc) and returns
+ * the result. All other bits of @ubifs_flags are ignored.
+ */
+static int ubifs2ioctl(int ubifs_flags)
+{
+	int ioctl_flags;
+
+	ioctl_flags  = (ubifs_flags & UBIFS_COMPR_FL) ? FS_COMPR_FL : 0;
+	ioctl_flags |= (ubifs_flags & UBIFS_SYNC_FL) ? FS_SYNC_FL : 0;
+	ioctl_flags |= (ubifs_flags & UBIFS_APPEND_FL) ? FS_APPEND_FL : 0;
+	ioctl_flags |= (ubifs_flags & UBIFS_IMMUTABLE_FL) ? FS_IMMUTABLE_FL : 0;
+	ioctl_flags |= (ubifs_flags & UBIFS_DIRSYNC_FL) ? FS_DIRSYNC_FL : 0;
+
+	return ioctl_flags;
+}
+
+/**
+ * setflags - change the flags of an inode.
+ * @inode: inode to change the flags of
+ * @flags: new flags in ioctl format (%FS_COMPR_FL, etc)
+ *
+ * This function budgets for one dirtied inode, replaces the UBIFS inode flags
+ * with the converted @flags, propagates them to the VFS inode, updates the
+ * inode change time and marks the inode as dirty. Changing %FS_APPEND_FL or
+ * %FS_IMMUTABLE_FL requires %CAP_LINUX_IMMUTABLE, otherwise %-EPERM is
+ * returned. If the inode is synchronous, it is written out right away.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int setflags(struct inode *inode, int flags)
+{
+ int oldflags, err, release;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ui->data_len };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ */
+ mutex_lock(&ui->ui_mutex);
+ oldflags = ubifs2ioctl(ui->flags);
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ }
+
+ ui->flags = ioctl2ubifs(flags);
+ ubifs_set_inode_flags(inode);
+ inode->i_ctime = ubifs_current_time(inode);
+ /*
+ * Sample the dirty state before the inode is marked as dirty. If the
+ * inode was already dirty, the budget acquired above is released below
+ * (presumably because a dirty inode already has its own budget - TODO
+ * confirm against the budgeting code).
+ */
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &req);
+ if (IS_SYNC(inode))
+ err = write_inode_now(inode, 1);
+ return err;
+
+out_unlock:
+ /* Nothing was changed, so give the whole budget back */
+ ubifs_err("can't modify inode %lu attributes", inode->i_ino);
+ mutex_unlock(&ui->ui_mutex);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * ubifs_ioctl - handle inode flag ioctl() calls.
+ * @file: file the ioctl was issued on
+ * @cmd: ioctl command
+ * @arg: user-space pointer to an integer which holds the flags
+ *
+ * %FS_IOC_GETFLAGS copies the inode flags to user-space and %FS_IOC_SETFLAGS
+ * changes them. Any other command yields %-ENOTTY. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int flags, err;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	if (cmd == FS_IOC_GETFLAGS) {
+		flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+		return put_user(flags, (int __user *) arg);
+	}
+
+	if (cmd != FS_IOC_SETFLAGS)
+		return -ENOTTY;
+
+	if (IS_RDONLY(inode))
+		return -EROFS;
+
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
+	if (get_user(flags, (int __user *) arg))
+		return -EFAULT;
+
+	/* The directory-synchronous flag makes sense only for directories */
+	if (!S_ISDIR(inode->i_mode))
+		flags &= ~FS_DIRSYNC_FL;
+
+	/*
+	 * Make sure the file-system is read-write and make sure it will not
+	 * become read-only while we are changing the flags.
+	 */
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		return err;
+
+	err = setflags(inode, flags);
+	mnt_drop_write(file->f_path.mnt);
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+/**
+ * ubifs_compat_ioctl - handle inode flag ioctl() calls of compat tasks.
+ * @file: file the ioctl was issued on
+ * @cmd: compat ioctl command
+ * @arg: compat user-space pointer
+ *
+ * This function maps %FS_IOC32_GETFLAGS and %FS_IOC32_SETFLAGS to the native
+ * commands and passes them on to 'ubifs_ioctl()'. Any other command yields
+ * %-ENOIOCTLCMD.
+ */
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == FS_IOC32_GETFLAGS)
+		cmd = FS_IOC_GETFLAGS;
+	else if (cmd == FS_IOC32_SETFLAGS)
+		cmd = FS_IOC_SETFLAGS;
+	else
+		return -ENOIOCTLCMD;
+
+	return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
new file mode 100644
index 00000000000..283155abe5f
--- /dev/null
+++ b/fs/ubifs/journal.c
@@ -0,0 +1,1387 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS journal.
+ *
+ * The journal consists of 2 parts - the log and bud LEBs. The log has fixed
+ * length and position, while a bud logical eraseblock is any LEB in the main
+ * area. Buds contain file system data - data nodes, inode nodes, etc. The log
+ * contains only references to buds and some other stuff like commit
+ * start node. The idea is that when we commit the journal, we do
+ * not copy the data, the buds just become indexed. Since after the commit the
+ * nodes in bud eraseblocks become leaf nodes of the file system index tree, we
+ * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will
+ * become leafs in the future.
+ *
+ * The journal is multi-headed because we want to write data to the journal as
+ * optimally as possible. It is nice to have nodes belonging to the same inode
+ * in one LEB, so we may write data owned by different inodes to different
+ * journal heads, although at present only one data head is used.
+ *
+ * For recovery reasons, the base head contains all inode nodes, all directory
+ * entry nodes and all truncate nodes. This means that the other heads contain
+ * only data nodes.
+ *
+ * Bud LEBs may be half-indexed. For example, if the bud was not full at the
+ * time of commit, the bud is retained to continue to be used in the journal,
+ * even though the "front" of the LEB is now indexed. In that case, the log
+ * reference contains the offset where the bud starts for the purposes of the
+ * journal.
+ *
+ * The journal size has to be limited, because the larger is the journal, the
+ * longer it takes to mount UBIFS (scanning the journal) and the more memory it
+ * takes (indexing in the TNC).
+ *
+ * All the journal write operations like 'ubifs_jnl_update()' here, which write
+ * multiple UBIFS nodes to the journal at one go, are atomic with respect to
+ * unclean reboots. Should the unclean reboot happen, the recovery code drops
+ * all the nodes.
+ */
+
+#include "ubifs.h"
+
+/**
+ * zero_ino_node_unused - clear the padding of an on-flash inode node.
+ * @ino: the inode node to clear the padding of
+ *
+ * This helper fills 4 bytes of @ino->padding1 and 26 bytes of @ino->padding2
+ * with zeroes.
+ */
+static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
+{
+	memset(ino->padding2, 0, 26);
+	memset(ino->padding1, 0, 4);
+}
+
+/**
+ * zero_dent_node_unused - clear the padding of an on-flash directory entry
+ *                         node.
+ * @dent: the directory entry node to clear the padding of
+ *
+ * This helper sets @dent->padding1 to zero and fills 4 bytes of
+ * @dent->padding2 with zeroes.
+ */
+static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
+{
+	memset(dent->padding2, 0, 4);
+	dent->padding1 = 0;
+}
+
+/**
+ * zero_data_node_unused - zero out unused fields of an on-flash data node.
+ * @data: the data node to zero out
+ *
+ * This helper fills the first 2 bytes of @data->padding with zeroes.
+ */
+static inline void zero_data_node_unused(struct ubifs_data_node *data)
+{
+ memset(data->padding, 0, 2);
+}
+
+/**
+ * zero_trun_node_unused - zero out unused fields of an on-flash truncation
+ * node.
+ * @trun: the truncation node to zero out
+ *
+ * This helper fills the first 12 bytes of @trun->padding with zeroes.
+ */
+static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
+{
+ memset(trun->padding, 0, 12);
+}
+
+/**
+ * reserve_space - reserve space in the journal.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head number
+ * @len: node length
+ *
+ * This function reserves space in journal head @jhead. If the reservation
+ * succeeded, the journal head stays locked and later has to be unlocked using
+ * 'release_head()'. Note, 'write_node()' and 'write_head()' as used in this
+ * file do not unlock the head, their callers call 'release_head()' after
+ * writing. In case of failure the journal head is left unlocked. Returns zero
+ * in case of success, %-EAGAIN if commit has to be done, and other negative
+ * error codes in case of other failures.
+ */
+static int reserve_space(struct ubifs_info *c, int jhead, int len)
+{
+ int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+ struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+ /*
+ * Typically, the base head has smaller nodes written to it, so it is
+ * better to try to allocate space at the ends of eraseblocks. This is
+ * what the squeeze parameter does.
+ */
+ squeeze = (jhead == BASEHD);
+again:
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_unlock;
+ }
+
+ /* Fast path: the current LEB of the head still has enough room */
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+ if (wbuf->lnum != -1 && avail >= len)
+ return 0;
+
+ /*
+ * Write buffer wasn't seek'ed or there is not enough space - look for
+ * an LEB with some empty space.
+ */
+ lnum = ubifs_find_free_space(c, len, &free, squeeze);
+ if (lnum >= 0) {
+ /* Found an LEB, add it to the journal head */
+ offs = c->leb_size - free;
+ err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+ if (err)
+ goto out_return;
+ /* A new bud was successfully allocated and added to the log */
+ goto out;
+ }
+
+ err = lnum;
+ if (err != -ENOSPC)
+ goto out_unlock;
+
+ /*
+ * No free space, we have to run garbage collector to make
+ * some. But the write-buffer mutex has to be unlocked because
+ * GC also takes it.
+ */
+ dbg_jnl("no free space jhead %d, run GC", jhead);
+ mutex_unlock(&wbuf->io_mutex);
+
+ lnum = ubifs_garbage_collect(c, 0);
+ if (lnum < 0) {
+ /* The mutex is not held here, so return without unlocking */
+ err = lnum;
+ if (err != -ENOSPC)
+ return err;
+
+ /*
+ * GC could not make a free LEB. But someone else may
+ * have allocated new bud for this journal head,
+ * because we dropped @wbuf->io_mutex, so try once
+ * again.
+ */
+ dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+ if (retries++ < 2) {
+ dbg_jnl("retry (%d)", retries);
+ goto again;
+ }
+
+ dbg_jnl("return -ENOSPC");
+ return err;
+ }
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+
+ if (wbuf->lnum != -1 && avail >= len) {
+ /*
+ * Someone else has switched the journal head and we have
+ * enough space now. This happens when more than one process is
+ * trying to write to the same journal head at the same time.
+ */
+ dbg_jnl("return LEB %d back, already have LEB %d:%d",
+ lnum, wbuf->lnum, wbuf->offs + wbuf->used);
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ goto out_unlock;
+ return 0;
+ }
+
+ /* The LEB returned by GC is used from its very beginning */
+ err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
+ if (err)
+ goto out_return;
+ offs = 0;
+
+out:
+ err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+
+out_return:
+ /* An error occurred and the LEB has to be returned to lprops */
+ ubifs_assert(err < 0);
+ err1 = ubifs_return_leb(c, lnum);
+ if (err1 && err == -EAGAIN)
+ /*
+ * Return original error code only if it is not %-EAGAIN,
+ * which is not really an error. Otherwise, return the error
+ * code of 'ubifs_return_leb()'.
+ */
+ err = err1;
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+}
+
+/**
+ * write_node - write node to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @node: node to write
+ * @len: node length
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function prepares @node and writes it to the write-buffer of journal
+ * head @jhead, which must not be the GC head. The position the node is
+ * written to is stored in @lnum and @offs. Returns zero in case of success and
+ * a negative error code in case of failure.
+ */
+static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
+		      int *lnum, int *offs)
+{
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+	ubifs_assert(jhead != GCHD);
+
+	/* The node goes right behind what the write-buffer already holds */
+	*lnum = wbuf->lnum;
+	*offs = wbuf->offs + wbuf->used;
+
+	dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+	ubifs_prepare_node(c, node, len, 0);
+
+	return ubifs_wbuf_write_nolock(wbuf, node, len);
+}
+
+/**
+ * write_head - write data to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @buf: buffer to write
+ * @len: length to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function is the same as 'write_node()' but it does not assume the
+ * buffer it is writing is a node, so it does not prepare it (which means
+ * initializing common header and calculating CRC). If @sync is non-zero, the
+ * write-buffer is synchronized after a successful write. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
+		      int *lnum, int *offs, int sync)
+{
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+	int err;
+
+	ubifs_assert(jhead != GCHD);
+
+	/* The data goes right behind what the write-buffer already holds */
+	*lnum = wbuf->lnum;
+	*offs = wbuf->offs + wbuf->used;
+	dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+
+	err = ubifs_wbuf_write_nolock(wbuf, buf, len);
+	if (!err && sync)
+		err = ubifs_wbuf_sync_nolock(wbuf);
+
+	return err;
+}
+
+/**
+ * make_reservation - reserve journal space.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @len: how many bytes to reserve
+ *
+ * This function makes space reservation in journal head @jhead. The function
+ * takes the commit lock and locks the journal head, and the caller has to
+ * unlock the head and finish the reservation with 'finish_reservation()'.
+ * Returns zero in case of success and a negative error code in case of
+ * failure. In case of failure neither the commit lock nor the journal head
+ * stay locked.
+ *
+ * Note, the journal head may be unlocked as soon as the data is written, while
+ * the commit lock has to be released after the data has been added to the
+ * TNC.
+ */
+static int make_reservation(struct ubifs_info *c, int jhead, int len)
+{
+ int err, cmt_retries = 0, nospc_retries = 0;
+
+again:
+ down_read(&c->commit_sem);
+ err = reserve_space(c, jhead, len);
+ if (!err)
+ return 0;
+ up_read(&c->commit_sem);
+
+ if (err == -ENOSPC) {
+ /*
+ * GC could not make any progress. We should try to commit
+ * once because it could make some dirty space and GC would
+ * make progress, so make the error -EAGAIN so that the below
+ * will commit and re-try.
+ */
+ if (nospc_retries++ < 2) {
+ dbg_jnl("no space, retry");
+ err = -EAGAIN;
+ }
+
+ /*
+ * This means that the budgeting is incorrect. We always have
+ * to be able to write to the media, because all operations are
+ * budgeted. Deletions are not budgeted, though, but we reserve
+ * an extra LEB for them.
+ */
+ }
+
+ if (err != -EAGAIN)
+ goto out;
+
+ /*
+ * -EAGAIN means that the journal is full or too large, or the above
+ * code wants to do one commit. Do this and re-try.
+ */
+ if (cmt_retries > 128) {
+ /*
+ * This should not happen unless the journal size limitations
+ * are too tough.
+ */
+ ubifs_err("stuck in space allocation");
+ err = -ENOSPC;
+ goto out;
+ } else if (cmt_retries > 32)
+ ubifs_warn("too many space allocation re-tries (%d)",
+ cmt_retries);
+
+ dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
+ cmt_retries);
+ cmt_retries += 1;
+
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ goto again;
+
+out:
+ ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
+ len, jhead, err);
+ if (err == -ENOSPC) {
+ /* These are some budgeting problems, print useful information */
+ down_write(&c->commit_sem);
+ spin_lock(&c->space_lock);
+ dbg_dump_stack();
+ dbg_dump_budg(c);
+ spin_unlock(&c->space_lock);
+ dbg_dump_lprops(c);
+ /*
+ * The result of the check is not used, @cmt_retries only
+ * serves as a scratch variable here.
+ */
+ cmt_retries = dbg_check_lprops(c);
+ up_write(&c->commit_sem);
+ }
+ return err;
+}
+
+/**
+ * release_head - release a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ *
+ * This function unlocks the write-buffer mutex of journal head @jhead, which
+ * stays locked after a successful 'make_reservation()' call. It has to be
+ * called after each successful 'make_reservation()' invocation.
+ */
+static inline void release_head(struct ubifs_info *c, int jhead)
+{
+	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+	mutex_unlock(&wbuf->io_mutex);
+}
+
+/**
+ * finish_reservation - finish a reservation.
+ * @c: UBIFS file-system description object
+ *
+ * This function finishes journal space reservation by dropping the read side
+ * of @c->commit_sem, which stays taken after a successful
+ * 'make_reservation()' call. It must be called after 'make_reservation()'.
+ */
+static void finish_reservation(struct ubifs_info *c)
+{
+ up_read(&c->commit_sem);
+}
+
+/**
+ * get_dent_type - translate VFS inode mode to UBIFS directory entry type.
+ * @mode: inode mode
+ *
+ * This function looks at the file type bits of @mode and returns the
+ * corresponding %UBIFS_ITYPE_* constant. An unknown file type triggers
+ * 'BUG()'.
+ */
+static int get_dent_type(int mode)
+{
+	const int fmt = mode & S_IFMT;
+
+	if (fmt == S_IFREG)
+		return UBIFS_ITYPE_REG;
+	if (fmt == S_IFDIR)
+		return UBIFS_ITYPE_DIR;
+	if (fmt == S_IFLNK)
+		return UBIFS_ITYPE_LNK;
+	if (fmt == S_IFBLK)
+		return UBIFS_ITYPE_BLK;
+	if (fmt == S_IFCHR)
+		return UBIFS_ITYPE_CHR;
+	if (fmt == S_IFIFO)
+		return UBIFS_ITYPE_FIFO;
+	if (fmt == S_IFSOCK)
+		return UBIFS_ITYPE_SOCK;
+
+	/* None of the known file types matched */
+	BUG();
+	return 0;
+}
+
+/**
+ * pack_inode - pack an inode node.
+ * @c: UBIFS file-system description object
+ * @ino: buffer in which to pack inode node
+ * @inode: inode to pack
+ * @last: indicates the last node of the group
+ * @last_reference: non-zero if this is a deletion inode
+ *
+ * This function fills the on-flash inode node @ino from @inode and prepares
+ * it as a member of a node group. The data attached to the inode is copied
+ * into the node unless @last_reference is non-zero.
+ */
+static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
+		       const struct inode *inode, int last,
+		       int last_reference)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	/* A deletion inode carries no attached data, it is not needed */
+	int data_len = last_reference ? 0 : ui->data_len;
+
+	/* Node type, key and creation sequence number */
+	ino->ch.node_type = UBIFS_INO_NODE;
+	ino_key_init_flash(c, &ino->key, inode->i_ino);
+	ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
+
+	/* Time stamps */
+	ino->atime_sec  = cpu_to_le64(inode->i_atime.tv_sec);
+	ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ino->ctime_sec  = cpu_to_le64(inode->i_ctime.tv_sec);
+	ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ino->mtime_sec  = cpu_to_le64(inode->i_mtime.tv_sec);
+	ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	/* Ownership, mode and link count */
+	ino->uid   = cpu_to_le32(inode->i_uid);
+	ino->gid   = cpu_to_le32(inode->i_gid);
+	ino->mode  = cpu_to_le32(inode->i_mode);
+	ino->nlink = cpu_to_le32(inode->i_nlink);
+
+	/* UBIFS-specific inode fields */
+	ino->flags      = cpu_to_le32(ui->flags);
+	ino->size       = cpu_to_le64(ui->ui_size);
+	ino->compr_type = cpu_to_le16(ui->compr_type);
+	ino->data_len   = cpu_to_le32(ui->data_len);
+
+	/* Extended attribute accounting */
+	ino->xattr_cnt   = cpu_to_le32(ui->xattr_cnt);
+	ino->xattr_size  = cpu_to_le32(ui->xattr_size);
+	ino->xattr_names = cpu_to_le32(ui->xattr_names);
+
+	zero_ino_node_unused(ino);
+
+	if (!last_reference)
+		memcpy(ino->data, ui->data, data_len);
+
+	ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
+}
+
+/**
+ * mark_inode_clean - mark UBIFS inode as clean.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to mark as clean
+ *
+ * This helper function releases the dirty inode budget of @ui if the inode is
+ * dirty and then clears the @ui->dirty flag. Note, VFS may still treat the
+ * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
+ * just do nothing.
+ */
+static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
+{
+	const int was_dirty = ui->dirty;
+
+	if (was_dirty)
+		ubifs_release_dirty_inode_budget(c, ui);
+
+	ui->dirty = 0;
+}
+
+/**
+ * ubifs_jnl_update - update inode.
+ * @c: UBIFS file-system description object
+ * @dir: parent inode or host inode in case of extended attributes
+ * @nm: directory entry name
+ * @inode: inode to update
+ * @deletion: indicates a directory entry deletion i.e unlink or rmdir
+ * @xent: non-zero if the directory entry is an extended attribute entry
+ *
+ * This function updates an inode by writing a directory entry (or extended
+ * attribute entry), the inode itself, and the parent directory inode (or the
+ * host inode) to the journal.
+ *
+ * The function writes the host inode @dir last, which is important in case of
+ * extended attributes. Indeed, then we guarantee that if the host inode gets
+ * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
+ * the extended attribute inode gets flushed too. And this is exactly what the
+ * user expects - synchronizing the host inode synchronizes its extended
+ * attributes. Similarly, this guarantees that if @dir is synchronized, its
+ * directory entry corresponding to @nm gets synchronized too.
+ *
+ * If the inode (@inode) or the parent directory (@dir) are synchronous, this
+ * function synchronizes the write-buffer.
+ *
+ * This function marks the @dir and @inode inodes as clean and returns zero on
+ * success. In case of failure, a negative error code is returned. The
+ * temporary node buffer is freed on every exit path.
+ */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+		     const struct qstr *nm, const struct inode *inode,
+		     int deletion, int xent)
+{
+	int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
+	int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
+	int last_reference = !!(deletion && inode->i_nlink == 0);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_dent_node *dent;
+	struct ubifs_ino_node *ino;
+	union ubifs_key dent_key, ino_key;
+
+	dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
+		inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
+	ubifs_assert(dir_ui->data_len == 0);
+	ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
+
+	dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+	ilen = UBIFS_INO_NODE_SZ;
+
+	/*
+	 * If the last reference to the inode is being deleted, then there is
+	 * no need to attach and write inode data, it is being deleted anyway.
+	 * And if the inode is being deleted, no need to synchronize
+	 * write-buffer even if the inode is synchronous.
+	 */
+	if (!last_reference) {
+		ilen += ui->data_len;
+		sync |= IS_SYNC(inode);
+	}
+
+	/*
+	 * One buffer holds all three nodes: the directory entry, the inode and
+	 * the parent (host) inode, each of the first two aligned to 8 bytes.
+	 */
+	aligned_dlen = ALIGN(dlen, 8);
+	aligned_ilen = ALIGN(ilen, 8);
+	len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
+	dent = kmalloc(len, GFP_NOFS);
+	if (!dent)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err)
+		goto out_free;
+
+	if (!xent) {
+		dent->ch.node_type = UBIFS_DENT_NODE;
+		dent_key_init(c, &dent_key, dir->i_ino, nm);
+	} else {
+		dent->ch.node_type = UBIFS_XENT_NODE;
+		xent_key_init(c, &dent_key, dir->i_ino, nm);
+	}
+
+	key_write(c, &dent_key, dent->key);
+	dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
+	dent->type = get_dent_type(inode->i_mode);
+	dent->nlen = cpu_to_le16(nm->len);
+	memcpy(dent->name, nm->name, nm->len);
+	dent->name[nm->len] = '\0';
+	zero_dent_node_unused(dent);
+	ubifs_prep_grp_node(c, dent, dlen, 0);
+
+	ino = (void *)dent + aligned_dlen;
+	pack_inode(c, ino, inode, 0, last_reference);
+	ino = (void *)ino + aligned_ilen;
+	pack_inode(c, ino, dir, 1, 0);
+
+	if (last_reference) {
+		err = ubifs_add_orphan(c, inode->i_ino);
+		if (err) {
+			release_head(c, BASEHD);
+			goto out_finish;
+		}
+	}
+
+	err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
+	if (err)
+		goto out_release;
+	if (!sync) {
+		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+		ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
+	}
+	release_head(c, BASEHD);
+	kfree(dent);
+
+	if (deletion) {
+		err = ubifs_tnc_remove_nm(c, &dent_key, nm);
+		if (err)
+			goto out_ro;
+		err = ubifs_add_dirt(c, lnum, dlen);
+	} else
+		err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
+	if (err)
+		goto out_ro;
+
+	/*
+	 * Note, we do not remove the inode from TNC even if the last reference
+	 * to it has just been deleted, because the inode may still be opened.
+	 * Instead, the inode has been added to orphan lists and the orphan
+	 * subsystem will take further care about it.
+	 */
+	ino_key_init(c, &ino_key, inode->i_ino);
+	ino_offs = dent_offs + aligned_dlen;
+	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
+	if (err)
+		goto out_ro;
+
+	ino_key_init(c, &ino_key, dir->i_ino);
+	ino_offs += aligned_ilen;
+	err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&ui->ui_lock);
+	ui->synced_i_size = ui->ui_size;
+	spin_unlock(&ui->ui_lock);
+	mark_inode_clean(c, ui);
+	mark_inode_clean(c, dir_ui);
+	return 0;
+
+out_finish:
+	finish_reservation(c);
+out_free:
+	kfree(dent);
+	return err;
+
+out_release:
+	release_head(c, BASEHD);
+	/*
+	 * The write failed before the buffer was freed, so free it here.
+	 * Jumps straight to 'out_ro' happen only after the buffer has already
+	 * been freed.
+	 */
+	kfree(dent);
+out_ro:
+	ubifs_ro_mode(c, err);
+	if (last_reference)
+		ubifs_delete_orphan(c, inode->i_ino);
+	finish_reservation(c);
+	return err;
+}
+
+/**
+ * ubifs_jnl_write_data - write a data node to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode the data node belongs to
+ * @key: node key
+ * @buf: buffer to write
+ * @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
+ *
+ * This function writes a data node to the journal. The data is compressed
+ * with the compressor of the inode only if the inode has the %UBIFS_COMPR_FL
+ * flag set, otherwise %UBIFS_COMPR_NONE is requested. Returns %0 if the data
+ * node was successfully written, and a negative error code in case of failure.
+ */
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+			 const union ubifs_key *key, const void *buf, int len)
+{
+	struct ubifs_data_node *data;
+	int err, lnum, offs, compr_type, out_len;
+	int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
+		key_block(c, key), len, DBGKEY(key));
+	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
+
+	/* The buffer is sized for the worst case compression outcome */
+	data = kmalloc(dlen, GFP_NOFS);
+	if (!data)
+		return -ENOMEM;
+
+	data->ch.node_type = UBIFS_DATA_NODE;
+	key_write(c, key, &data->key);
+	data->size = cpu_to_le32(len);
+	zero_data_node_unused(data);
+
+	/*
+	 * Test the compression bit with a bitwise AND. A logical AND would be
+	 * true for any non-zero flags value and would compress inodes which
+	 * do not have the compression flag set.
+	 */
+	if (!(ui->flags & UBIFS_COMPR_FL))
+		/* Compression is disabled for this inode */
+		compr_type = UBIFS_COMPR_NONE;
+	else
+		compr_type = ui->compr_type;
+
+	out_len = dlen - UBIFS_DATA_NODE_SZ;
+	ubifs_compress(buf, len, &data->data, &out_len, &compr_type);
+	ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+
+	/* From now on @dlen is the real length of the node to write */
+	dlen = UBIFS_DATA_NODE_SZ + out_len;
+	data->compr_type = cpu_to_le16(compr_type);
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, DATAHD, dlen);
+	if (err)
+		goto out_free;
+
+	err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
+	if (err)
+		goto out_release;
+	ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
+	release_head(c, DATAHD);
+
+	err = ubifs_tnc_add(c, key, lnum, offs, dlen);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	kfree(data);
+	return 0;
+
+out_release:
+	release_head(c, DATAHD);
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	kfree(data);
+	return err;
+}
+
+/**
+ * ubifs_jnl_write_inode - flush inode to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode to flush
+ * @deletion: inode has been deleted
+ *
+ * This function writes inode @inode to the journal. If the inode is
+ * synchronous, it also synchronizes the write-buffer. If a failure happens
+ * after the journal space has been reserved, the file-system is switched to
+ * read-only mode. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
+ int deletion)
+{
+ int err, len, lnum, offs, sync = 0;
+ struct ubifs_ino_node *ino;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ dbg_jnl("ino %lu%s", inode->i_ino,
+ deletion ? " (last reference)" : "");
+ if (deletion)
+ ubifs_assert(inode->i_nlink == 0);
+
+ len = UBIFS_INO_NODE_SZ;
+ /*
+ * If the inode is being deleted, do not write the attached data. No
+ * need to synchronize the write-buffer either.
+ */
+ if (!deletion) {
+ len += ui->data_len;
+ sync = IS_SYNC(inode);
+ }
+ ino = kmalloc(len, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ pack_inode(c, ino, inode, 1, deletion);
+ err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ inode->i_ino);
+ release_head(c, BASEHD);
+
+ if (deletion) {
+ /*
+ * The inode goes away: remove it from the TNC and from the
+ * orphan list, and account the node just written as dirty
+ * space.
+ */
+ err = ubifs_tnc_remove_ino(c, inode->i_ino);
+ if (err)
+ goto out_ro;
+ ubifs_delete_orphan(c, inode->i_ino);
+ err = ubifs_add_dirt(c, lnum, len);
+ } else {
+ union ubifs_key key;
+
+ ino_key_init(c, &key, inode->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, len);
+ }
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&ui->ui_lock);
+ ui->synced_i_size = ui->ui_size;
+ spin_unlock(&ui->ui_lock);
+ kfree(ino);
+ return 0;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(ino);
+ return err;
+}
+
+/**
+ * ubifs_jnl_rename - rename a directory entry.
+ * @c: UBIFS file-system description object
+ * @old_dir: parent inode of directory entry to rename
+ * @old_dentry: directory entry to rename
+ * @new_dir: parent inode of the new directory entry
+ * @new_dentry: new directory entry (or directory entry to replace)
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function implements the re-name operation which may involve writing up
+ * to 3 inodes and 2 directory entries. It marks the written inodes as clean
+ * and returns zero on success. In case of failure, a negative error code is
+ * returned.
+ *
+ * The nodes are packed into one buffer in this order: the new directory
+ * entry, a deletion entry (inode number zero) for the old name, the inode
+ * which @new_dentry refers to (only if there is one), @old_dir and, if the
+ * entry moves between directories, @new_dir.
+ */
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry, int sync)
+{
+ void *p;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *dent2;
+ int err, dlen1, dlen2, ilen, lnum, offs, len;
+ const struct inode *old_inode = old_dentry->d_inode;
+ const struct inode *new_inode = new_dentry->d_inode;
+ int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
+ /* Set when the replaced inode has no links left after this rename */
+ int last_reference = !!(new_inode && new_inode->i_nlink == 0);
+ int move = (old_dir != new_dir);
+ struct ubifs_inode *uninitialized_var(new_ui);
+
+ dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ old_dir->i_ino, new_dentry->d_name.len,
+ new_dentry->d_name.name, new_dir->i_ino);
+ ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
+ ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
+ ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
+ ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
+
+ /* dlen1: new entry, dlen2: deletion entry; "+ 1" is for the NUL byte */
+ dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
+ if (new_inode) {
+ new_ui = ubifs_inode(new_inode);
+ ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
+ ilen = UBIFS_INO_NODE_SZ;
+ /* Inode data is accounted for only if the inode keeps other links */
+ if (!last_reference)
+ ilen += new_ui->data_len;
+ } else
+ ilen = 0;
+
+ /* Every node but the last one occupies an 8-byte aligned slot */
+ aligned_dlen1 = ALIGN(dlen1, 8);
+ aligned_dlen2 = ALIGN(dlen2, 8);
+ len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+ if (old_dir != new_dir)
+ len += plen;
+ dent = kmalloc(len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ /* Make new dent */
+ dent->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
+ dent->inum = cpu_to_le64(old_inode->i_ino);
+ dent->type = get_dent_type(old_inode->i_mode);
+ dent->nlen = cpu_to_le16(new_dentry->d_name.len);
+ memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
+ dent->name[new_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent);
+ ubifs_prep_grp_node(c, dent, dlen1, 0);
+
+ /* Make deletion dent */
+ dent2 = (void *)dent + aligned_dlen1;
+ dent2->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
+ &old_dentry->d_name);
+ dent2->inum = 0;
+ dent2->type = DT_UNKNOWN;
+ dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
+ memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
+ dent2->name[old_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent2);
+ ubifs_prep_grp_node(c, dent2, dlen2, 0);
+
+ /* The inode nodes follow the two directory entries */
+ p = (void *)dent2 + aligned_dlen2;
+ if (new_inode) {
+ pack_inode(c, p, new_inode, 0, last_reference);
+ p += ALIGN(ilen, 8);
+ }
+
+ /* One parent inode if renaming in place, both parents if moving */
+ if (!move)
+ pack_inode(c, p, old_dir, 1, 0);
+ else {
+ pack_inode(c, p, old_dir, 0, 0);
+ p += ALIGN(plen, 8);
+ pack_inode(c, p, new_dir, 1, 0);
+ }
+
+ if (last_reference) {
+ err = ubifs_add_orphan(c, new_inode->i_ino);
+ if (err) {
+ /* Nothing was written yet, so do not switch to R/O mode */
+ release_head(c, BASEHD);
+ goto out_finish;
+ }
+ }
+
+ err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync) {
+ struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+ ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
+ ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
+ if (new_inode)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ new_inode->i_ino);
+ }
+ release_head(c, BASEHD);
+
+ /* The new entry is the first node of the group, i.e. it is at @offs */
+ dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ /* The deletion entry is not added to the TNC, only accounted via add_dirt */
+ err = ubifs_add_dirt(c, lnum, dlen2);
+ if (err)
+ goto out_ro;
+
+ dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
+ err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ /* Walk @offs over the inode nodes the same way they were packed */
+ offs += aligned_dlen1 + aligned_dlen2;
+ if (new_inode) {
+ ino_key_init(c, &key, new_inode->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
+ if (err)
+ goto out_ro;
+ offs += ALIGN(ilen, 8);
+ }
+
+ ino_key_init(c, &key, old_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+
+ if (old_dir != new_dir) {
+ offs += ALIGN(plen, 8);
+ ino_key_init(c, &key, new_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+ }
+
+ finish_reservation(c);
+ if (new_inode) {
+ mark_inode_clean(c, new_ui);
+ spin_lock(&new_ui->ui_lock);
+ new_ui->synced_i_size = new_ui->ui_size;
+ spin_unlock(&new_ui->ui_lock);
+ }
+ mark_inode_clean(c, ubifs_inode(old_dir));
+ if (move)
+ mark_inode_clean(c, ubifs_inode(new_dir));
+ kfree(dent);
+ return 0;
+
+ /* Error labels fall through: each one undoes one more step */
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ if (last_reference)
+ ubifs_delete_orphan(c, new_inode->i_ino);
+out_finish:
+ finish_reservation(c);
+out_free:
+ kfree(dent);
+ return err;
+}
+
+/**
+ * recomp_data_node - shrink a data node by re-compressing its payload.
+ * @dn: data node to re-compress
+ * @new_len: in: number of uncompressed bytes to keep; out: new node length
+ *
+ * Used by truncation when the last data node of an inode has to be cut and
+ * written again. The payload is decompressed into a temporary buffer, its
+ * first @new_len bytes are compressed back into @dn, and the compression type
+ * and size fields of @dn are updated. Returns zero on success, %-ENOMEM if the
+ * temporary buffer cannot be allocated, or the error returned by
+ * 'ubifs_decompress()'.
+ */
+static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
+{
+	int compr_type = le16_to_cpu(dn->compr_type);
+	int in_len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+	int out_len = le32_to_cpu(dn->size);
+	void *tmp;
+	int err;
+
+	tmp = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
+	err = ubifs_decompress(&dn->data, in_len, tmp, &out_len, compr_type);
+	if (!err) {
+		/* Compress only the part which survives the truncation */
+		ubifs_compress(tmp, *new_len, &dn->data, &out_len,
+			       &compr_type);
+		ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+		dn->compr_type = cpu_to_le16(compr_type);
+		dn->size = cpu_to_le32(*new_len);
+		*new_len = UBIFS_DATA_NODE_SZ + out_len;
+	}
+
+	kfree(tmp);
+	return err;
+}
+
+/**
+ * ubifs_jnl_truncate - update the journal for a truncation.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @old_size: old size
+ * @new_size: new size
+ *
+ * When the size of a file decreases due to truncation, a truncation node is
+ * written, the journal tree is updated, and the last data block is re-written
+ * if it has been affected. The inode is also updated in order to synchronize
+ * the new inode size.
+ *
+ * The write buffer is laid out as: inode node, truncation node and, only if
+ * the last data block has to be re-written, the shortened data node.
+ *
+ * This function marks the inode as clean and returns zero on success. In case
+ * of failure, a negative error code is returned.
+ */
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+ loff_t old_size, loff_t new_size)
+{
+ union ubifs_key key, to_key;
+ struct ubifs_ino_node *ino;
+ struct ubifs_trun_node *trun;
+ struct ubifs_data_node *uninitialized_var(dn);
+ int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ ino_t inum = inode->i_ino;
+ unsigned int blk;
+
+ dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
+ ubifs_assert(!ui->data_len);
+ ubifs_assert(S_ISREG(inode->i_mode));
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+
+ /* Room for the inode, the truncation node and a worst-case data node */
+ sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+ UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+ ino = kmalloc(sz, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ /* The truncation node is placed right after the inode node */
+ trun = (void *)ino + UBIFS_INO_NODE_SZ;
+ trun->ch.node_type = UBIFS_TRUN_NODE;
+ trun->inum = cpu_to_le32(inum);
+ trun->old_size = cpu_to_le64(old_size);
+ trun->new_size = cpu_to_le64(new_size);
+ zero_trun_node_unused(trun);
+
+ /* Number of bytes of the last block which survive the truncation */
+ dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
+ if (dlen) {
+ /* Get last data block so it can be truncated */
+ dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
+ blk = new_size >> UBIFS_BLOCK_SHIFT;
+ data_key_init(c, &key, inum, blk);
+ dbg_jnl("last block key %s", DBGKEY(&key));
+ err = ubifs_tnc_lookup(c, &key, dn);
+ if (err == -ENOENT)
+ dlen = 0; /* Not found (so it is a hole) */
+ else if (err)
+ goto out_free;
+ else {
+ if (le32_to_cpu(dn->size) <= dlen)
+ dlen = 0; /* Nothing to do */
+ else {
+ int compr_type = le16_to_cpu(dn->compr_type);
+
+ /* From here on @dlen is the length of the new data node */
+ if (compr_type != UBIFS_COMPR_NONE) {
+ err = recomp_data_node(dn, &dlen);
+ if (err)
+ goto out_free;
+ } else {
+ dn->size = cpu_to_le32(dlen);
+ dlen += UBIFS_DATA_NODE_SZ;
+ }
+ zero_data_node_unused(dn);
+ }
+ }
+ }
+
+ /* Must make reservation before allocating sequence numbers */
+ len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
+ if (dlen)
+ len += dlen;
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ pack_inode(c, ino, inode, 0, 0);
+ /* The last argument is 1 only for the final node of the buffer */
+ ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
+ if (dlen)
+ ubifs_prep_grp_node(c, dn, dlen, 1);
+
+ err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
+ release_head(c, BASEHD);
+
+ if (dlen) {
+ /* @key still holds the data key of the re-written last block */
+ sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
+ err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
+ if (err)
+ goto out_ro;
+ }
+
+ ino_key_init(c, &key, inum);
+ err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ /* The truncation node is not added to the TNC, only accounted via add_dirt */
+ err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ /* Lower bound: the first block which lies entirely beyond @new_size */
+ bit = new_size & (UBIFS_BLOCK_SIZE - 1);
+ blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
+ data_key_init(c, &key, inum, blk);
+
+ /* Upper bound: the block which holds the last byte of @old_size */
+ bit = old_size & (UBIFS_BLOCK_SIZE - 1);
+ blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
+ data_key_init(c, &to_key, inum, blk);
+
+ err = ubifs_tnc_remove_range(c, &key, &to_key);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&ui->ui_lock);
+ ui->synced_i_size = ui->ui_size;
+ spin_unlock(&ui->ui_lock);
+ mark_inode_clean(c, ui);
+ kfree(ino);
+ return 0;
+
+ /* Error labels fall through: each one undoes one more step */
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(ino);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_XATTR
+
+/**
+ * ubifs_jnl_delete_xattr - delete an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @nm: extended attribute entry name
+ *
+ * This function deletes an extended attribute, which is very similar to
+ * un-linking regular files - it writes a deletion xentry, a deletion inode and
+ * updates the target inode. The three nodes are packed in that order, the
+ * xentry slot being padded to 8 bytes. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+			   const struct inode *inode, const struct qstr *nm)
+{
+	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen, host_offs;
+	struct ubifs_dent_node *xent;
+	struct ubifs_ino_node *ino;
+	union ubifs_key xent_key, key1, key2;
+	int sync = IS_DIRSYNC(host);
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+
+	dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
+		host->i_ino, inode->i_ino, nm->name,
+		ubifs_inode(inode)->data_len);
+	ubifs_assert(inode->i_nlink == 0);
+	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+	/*
+	 * Since we are deleting the inode, we do not bother to attach any data
+	 * to it and assume its length is %UBIFS_INO_NODE_SZ.
+	 */
+	xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+	aligned_xlen = ALIGN(xlen, 8);
+	hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
+	len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
+
+	/*
+	 * Offset of the host inode node inside the buffer. It must not be
+	 * derived from 'len - hlen': @len includes the padding of @hlen to 8
+	 * bytes, so that expression is off by the padding whenever the host
+	 * inode carries data whose length is not a multiple of 8.
+	 */
+	host_offs = aligned_xlen + UBIFS_INO_NODE_SZ;
+
+	xent = kmalloc(len, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, len);
+	if (err) {
+		kfree(xent);
+		return err;
+	}
+
+	/* Deletion xentry: same key as the live entry, inode number zero */
+	xent->ch.node_type = UBIFS_XENT_NODE;
+	xent_key_init(c, &xent_key, host->i_ino, nm);
+	key_write(c, &xent_key, xent->key);
+	xent->inum = 0;
+	xent->type = get_dent_type(inode->i_mode);
+	xent->nlen = cpu_to_le16(nm->len);
+	memcpy(xent->name, nm->name, nm->len);
+	xent->name[nm->len] = '\0';
+	zero_dent_node_unused(xent);
+	ubifs_prep_grp_node(c, xent, xlen, 0);
+
+	/* Deletion inode, then the host inode as the final node */
+	ino = (void *)xent + aligned_xlen;
+	pack_inode(c, ino, inode, 0, 1);
+	ino = (void *)xent + host_offs;
+	pack_inode(c, ino, host, 1, 0);
+
+	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
+	if (!sync && !err)
+		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
+	release_head(c, BASEHD);
+	/* The buffer is not needed any more, whatever the write result was */
+	kfree(xent);
+	if (err)
+		goto out_ro;
+
+	/* Remove the extended attribute entry from TNC */
+	err = ubifs_tnc_remove_nm(c, &xent_key, nm);
+	if (err)
+		goto out_ro;
+	err = ubifs_add_dirt(c, lnum, xlen);
+	if (err)
+		goto out_ro;
+
+	/*
+	 * Remove all nodes belonging to the extended attribute inode from TNC.
+	 * Well, there actually must be only one node - the inode itself.
+	 */
+	lowest_ino_key(c, &key1, inode->i_ino);
+	highest_ino_key(c, &key2, inode->i_ino);
+	err = ubifs_tnc_remove_range(c, &key1, &key2);
+	if (err)
+		goto out_ro;
+	err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
+	if (err)
+		goto out_ro;
+
+	/* And update TNC with the new host inode position */
+	ino_key_init(c, &key1, host->i_ino);
+	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + host_offs, hlen);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&host_ui->ui_lock);
+	host_ui->synced_i_size = host_ui->ui_size;
+	spin_unlock(&host_ui->ui_lock);
+	mark_inode_clean(c, host_ui);
+	return 0;
+
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+	return err;
+}
+
+/**
+ * ubifs_jnl_change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @inode: extended attribute inode
+ * @host: host inode
+ *
+ * This function writes the updated versions of the host inode and of the
+ * extended attribute inode to the journal (to the base head) in one buffer:
+ * the host inode node comes first and the extended attribute inode node
+ * follows it at the next 8-byte aligned offset. Both nodes are then added to
+ * the TNC, and the host inode is marked clean with its synchronized size
+ * updated. This function returns zero in case of success and a negative error
+ * code in case of failure.
+ *
+ * NOTE(review): this comment used to state that the host inode is written
+ * after the extended attribute inode (so that 'fsync()' of the host flushes
+ * the attribute), which does not match the packing order below - confirm
+ * which order is intended.
+ */
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
+			   const struct inode *host)
+{
+	int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
+	/*
+	 * This has to be the UBIFS inode of @host, not of @inode: the lock
+	 * assertion, the length of the host node, 'synced_i_size' and the
+	 * clean marking below all refer to the host.
+	 */
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_ino_node *ino;
+	union ubifs_key key;
+	int sync = IS_DIRSYNC(host);
+
+	dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
+	ubifs_assert(host->i_nlink > 0);
+	ubifs_assert(inode->i_nlink > 0);
+	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+	/* len1: host inode node, len2: extended attribute inode node */
+	len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
+	len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
+	aligned_len1 = ALIGN(len1, 8);
+	aligned_len = aligned_len1 + ALIGN(len2, 8);
+
+	ino = kmalloc(aligned_len, GFP_NOFS);
+	if (!ino)
+		return -ENOMEM;
+
+	/* Make reservation before allocating sequence numbers */
+	err = make_reservation(c, BASEHD, aligned_len);
+	if (err)
+		goto out_free;
+
+	pack_inode(c, ino, host, 0, 0);
+	pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0);
+
+	/*
+	 * NOTE(review): the write is always requested as non-synchronous,
+	 * even when @sync is set - confirm this is intended.
+	 */
+	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
+	if (!sync && !err) {
+		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+		ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
+		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+	}
+	release_head(c, BASEHD);
+	if (err)
+		goto out_ro;
+
+	/* The host node starts the buffer, the xattr node follows it */
+	ino_key_init(c, &key, host->i_ino);
+	err = ubifs_tnc_add(c, &key, lnum, offs, len1);
+	if (err)
+		goto out_ro;
+
+	ino_key_init(c, &key, inode->i_ino);
+	err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
+	if (err)
+		goto out_ro;
+
+	finish_reservation(c);
+	spin_lock(&host_ui->ui_lock);
+	host_ui->synced_i_size = host_ui->ui_size;
+	spin_unlock(&host_ui->ui_lock);
+	mark_inode_clean(c, host_ui);
+	kfree(ino);
+	return 0;
+
+out_ro:
+	ubifs_ro_mode(c, err);
+	finish_reservation(c);
+out_free:
+	kfree(ino);
+	return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_XATTR */
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
new file mode 100644
index 00000000000..8f747600754
--- /dev/null
+++ b/fs/ubifs/key.h
@@ -0,0 +1,533 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This header contains various key-related definitions and helper functions.
+ * UBIFS allows several key schemes, so we access key fields only via these
+ * helpers. At the moment only one key scheme is supported.
+ *
+ * Simple key scheme
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * Keys are 64 bits long. The first 32 bits are the inode number (the parent
+ * inode number in case of a direntry key). The next 3 bits are the node type.
+ * The last 29 bits are the 4KiB block number in case of a data node, and the
+ * direntry hash in case of a direntry node. We use the "r5" hash borrowed
+ * from reiserfs.
+ */
+
+#ifndef __UBIFS_KEY_H__
+#define __UBIFS_KEY_H__
+
+/**
+ * key_r5_hash - R5 hash function (borrowed from reiserfs).
+ * @s: direntry name
+ * @len: name length
+ *
+ * Hashes exactly @len bytes of @s, so the name does not have to be
+ * NUL-terminated. The result is masked with %UBIFS_S_KEY_HASH_MASK and is
+ * never %0, %1 or %2, because these values are reserved (see below).
+ */
+static inline uint32_t key_r5_hash(const char *s, int len)
+{
+	uint32_t a = 0;
+	const signed char *str = (const signed char *)s;
+
+	/* Bound the walk by @len instead of relying on a terminating NUL */
+	while (len-- > 0) {
+		a += *str << 4;
+		a += *str >> 4;
+		a *= 11;
+		str++;
+	}
+
+	a &= UBIFS_S_KEY_HASH_MASK;
+
+	/*
+	 * We use hash values as offset in directories, so values %0 and %1 are
+	 * reserved for "." and "..". %2 is reserved for "end of readdir"
+	 * marker. @a is unsigned, so only the upper bound has to be checked.
+	 */
+	if (unlikely(a <= 2))
+		a += 3;
+	return a;
+}
+
+/**
+ * key_test_hash - testing hash function.
+ * @str: direntry name
+ * @len: name length
+ *
+ * The hash is made of at most the first 4 bytes of the name, copied as they
+ * are and masked with %UBIFS_S_KEY_HASH_MASK. Values %0, %1 and %2 are shifted
+ * up by 3.
+ */
+static inline uint32_t key_test_hash(const char *str, int len)
+{
+	uint32_t hash = 0;
+	int nbytes = min_t(uint32_t, len, 4);
+
+	memcpy(&hash, str, nbytes);
+	hash &= UBIFS_S_KEY_HASH_MASK;
+	/* @hash is unsigned, hence "<= 2" alone covers the range 0..2 */
+	return unlikely(hash <= 2) ? hash + 3 : hash;
+}
+
+/**
+ * ino_key_init - build the in-memory key of an inode node.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void ino_key_init(const struct ubifs_info *c,
+				union ubifs_key *key, ino_t inum)
+{
+	const uint32_t type_word = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS;
+
+	key->u32[1] = type_word;
+	key->u32[0] = inum;
+}
+
+/**
+ * ino_key_init_flash - build the on-flash (little-endian) key of an inode.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ *
+ * The bytes of @k after the first 8, up to %UBIFS_MAX_KEY_LEN, are zeroed.
+ */
+static inline void ino_key_init_flash(const struct ubifs_info *c, void *k,
+				      ino_t inum)
+{
+	union ubifs_key *flash_key = k;
+
+	/* The padding does not overlap the two key words filled in below */
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+	flash_key->j32[1] = cpu_to_le32(UBIFS_INO_KEY <<
+					UBIFS_S_KEY_BLOCK_BITS);
+	flash_key->j32[0] = cpu_to_le32(inum);
+}
+
+/**
+ * lowest_ino_key - get the lowest possible key of an inode.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * The second key word (type and block/hash bits) is set to zero.
+ */
+static inline void lowest_ino_key(const struct ubifs_info *c,
+				  union ubifs_key *key, ino_t inum)
+{
+	key->u32[1] = 0;
+	key->u32[0] = inum;
+}
+
+/**
+ * highest_ino_key - get the highest possible key of an inode.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * The second key word (type and block/hash bits) is set to all ones.
+ */
+static inline void highest_ino_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	key->u32[1] = 0xffffffff;
+	key->u32[0] = inum;
+}
+
+/**
+ * dent_key_init - build the in-memory key of a directory entry.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ *
+ * The name is hashed with the hash function configured in @c.
+ */
+static inline void dent_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 const struct qstr *nm)
+{
+	const uint32_t type_bits = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	uint32_t name_hash = c->key_hash(nm->name, nm->len);
+
+	/* The hash must fit into the bits left below the key type */
+	ubifs_assert((name_hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	key->u32[1] = name_hash | type_bits;
+	key->u32[0] = inum;
+}
+
+/**
+ * dent_key_init_hash - build the in-memory key of a directory entry from an
+ * already known name hash.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @hash: direntry name hash
+ */
+static inline void dent_key_init_hash(const struct ubifs_info *c,
+				      union ubifs_key *key, ino_t inum,
+				      uint32_t hash)
+{
+	const uint32_t type_bits = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+
+	/* The hash must fit into the bits left below the key type */
+	ubifs_assert((hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	key->u32[1] = hash | type_bits;
+	key->u32[0] = inum;
+}
+
+/**
+ * dent_key_init_flash - build the on-flash (little-endian) key of a directory
+ * entry.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ *
+ * The bytes of @k after the first 8, up to %UBIFS_MAX_KEY_LEN, are zeroed.
+ */
+static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, const struct qstr *nm)
+{
+	union ubifs_key *flash_key = k;
+	uint32_t name_hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert((name_hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	/* The padding does not overlap the two key words filled in below */
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+	flash_key->j32[1] = cpu_to_le32(name_hash |
+					(UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS));
+	flash_key->j32[0] = cpu_to_le32(inum);
+}
+
+/**
+ * lowest_dent_key - get the lowest possible directory entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: parent inode number
+ *
+ * This is the directory entry key of @inum with all hash bits cleared.
+ */
+static inline void lowest_dent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	const uint32_t type_word = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+
+	key->u32[1] = type_word;
+	key->u32[0] = inum;
+}
+
+/**
+ * xent_key_init - build the in-memory key of an extended attribute entry.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ *
+ * The name is hashed with the hash function configured in @c.
+ */
+static inline void xent_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 const struct qstr *nm)
+{
+	const uint32_t type_bits = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	uint32_t name_hash = c->key_hash(nm->name, nm->len);
+
+	/* The hash must fit into the bits left below the key type */
+	ubifs_assert((name_hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	key->u32[1] = name_hash | type_bits;
+	key->u32[0] = inum;
+}
+
+/**
+ * xent_key_init_hash - build the in-memory key of an extended attribute entry
+ * from an already known name hash.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @hash: extended attribute entry name hash
+ */
+static inline void xent_key_init_hash(const struct ubifs_info *c,
+				      union ubifs_key *key, ino_t inum,
+				      uint32_t hash)
+{
+	const uint32_t type_bits = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+
+	/* The hash must fit into the bits left below the key type */
+	ubifs_assert((hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	key->u32[1] = hash | type_bits;
+	key->u32[0] = inum;
+}
+
+/**
+ * xent_key_init_flash - build the on-flash (little-endian) key of an extended
+ * attribute entry.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ *
+ * The bytes of @k after the first 8, up to %UBIFS_MAX_KEY_LEN, are zeroed.
+ */
+static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, const struct qstr *nm)
+{
+	union ubifs_key *flash_key = k;
+	uint32_t name_hash = c->key_hash(nm->name, nm->len);
+
+	ubifs_assert((name_hash & ~UBIFS_S_KEY_HASH_MASK) == 0);
+	/* The padding does not overlap the two key words filled in below */
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+	flash_key->j32[1] = cpu_to_le32(name_hash |
+					(UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS));
+	flash_key->j32[0] = cpu_to_le32(inum);
+}
+
+/**
+ * lowest_xent_key - get the lowest possible extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: host inode number
+ *
+ * This is the extended attribute entry key of @inum with all hash bits
+ * cleared.
+ */
+static inline void lowest_xent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	const uint32_t type_word = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+
+	key->u32[1] = type_word;
+	key->u32[0] = inum;
+}
+
+/**
+ * data_key_init - build the in-memory key of a data node.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 unsigned int block)
+{
+	const uint32_t type_bits = UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS;
+
+	/* The block number must fit into the bits left below the key type */
+	ubifs_assert((block & ~UBIFS_S_KEY_BLOCK_MASK) == 0);
+	key->u32[1] = block | type_bits;
+	key->u32[0] = inum;
+}
+
+/**
+ * data_key_init_flash - build the on-flash (little-endian) key of a data
+ * node.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ * @block: block number
+ *
+ * The bytes of @k after the first 8, up to %UBIFS_MAX_KEY_LEN, are zeroed.
+ */
+static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, unsigned int block)
+{
+	union ubifs_key *flash_key = k;
+
+	ubifs_assert((block & ~UBIFS_S_KEY_BLOCK_MASK) == 0);
+	/* The padding does not overlap the two key words filled in below */
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+	flash_key->j32[1] = cpu_to_le32(block |
+					(UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
+	flash_key->j32[0] = cpu_to_le32(inum);
+}
+
+/**
+ * trun_key_init - build the in-memory key of a truncation node.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * Note, UBIFS does not have truncation keys on the media and this function is
+ * only used for purposes of replay.
+ */
+static inline void trun_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum)
+{
+	const uint32_t type_word = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
+
+	key->u32[1] = type_word;
+	key->u32[0] = inum;
+}
+
+/**
+ * key_type - get the type of an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: key to get type of
+ *
+ * The type is stored in the bits above %UBIFS_S_KEY_BLOCK_BITS of the second
+ * key word.
+ */
+static inline int key_type(const struct ubifs_info *c,
+			   const union ubifs_key *key)
+{
+	const uint32_t second_word = key->u32[1];
+
+	return second_word >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_type_flash - get type of a on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to get type of
+ *
+ * The type is stored in the bits above %UBIFS_S_KEY_BLOCK_BITS of the second
+ * (little-endian) key word.
+ */
+static inline int key_type_flash(const struct ubifs_info *c, const void *k)
+{
+	const union ubifs_key *key = k;
+
+	/* On-flash words are little-endian, so read them via 'j32' */
+	return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_inum - get the inode number stored in an in-memory key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
+{
+	/* The inode number is the first key word */
+	return ((const union ubifs_key *)k)->u32[0];
+}
+
+/**
+ * key_inum_flash - get the inode number stored in an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
+{
+	/* The inode number is the first (little-endian) key word */
+	return le32_to_cpu(((const union ubifs_key *)k)->j32[0]);
+}
+
+/**
+ * key_hash - get the name hash stored in an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: the key to get hash from
+ */
+static inline int key_hash(const struct ubifs_info *c,
+			   const union ubifs_key *key)
+{
+	const uint32_t second_word = key->u32[1];
+
+	return second_word & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_hash_flash - get the name hash stored in an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get hash from
+ */
+static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+{
+	const uint32_t second_word =
+		le32_to_cpu(((const union ubifs_key *)k)->j32[1]);
+
+	return second_word & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_block - get the data block number stored in an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: the key to get the block number from
+ */
+static inline unsigned int key_block(const struct ubifs_info *c,
+				     const union ubifs_key *key)
+{
+	const uint32_t second_word = key->u32[1];
+
+	return second_word & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_block_flash - get data block number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get the block number from
+ *
+ * The block number is stored in the low %UBIFS_S_KEY_BLOCK_MASK bits of the
+ * second (little-endian) key word.
+ */
+static inline unsigned int key_block_flash(const struct ubifs_info *c,
+					   const void *k)
+{
+	const union ubifs_key *key = k;
+
+	/* On-flash words are little-endian, so read them via 'j32' */
+	return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_read - convert an on-flash key to the in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ *
+ * Both 32-bit key words are converted from little-endian to CPU byte order.
+ */
+static inline void key_read(const struct ubifs_info *c, const void *from,
+			    union ubifs_key *to)
+{
+	const union ubifs_key *flash_key = from;
+	int i;
+
+	for (i = 0; i < 2; i++)
+		to->u32[i] = le32_to_cpu(flash_key->j32[i]);
+}
+
+/**
+ * key_write - convert an in-memory key to the on-flash format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ *
+ * Both 32-bit key words are converted to little-endian and the bytes of @to
+ * after the first 8, up to %UBIFS_MAX_KEY_LEN, are zeroed.
+ */
+static inline void key_write(const struct ubifs_info *c,
+			     const union ubifs_key *from, void *to)
+{
+	union ubifs_key *flash_key = to;
+	int i;
+
+	for (i = 0; i < 2; i++)
+		flash_key->j32[i] = cpu_to_le32(from->u32[i]);
+	memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * key_write_idx - convert an in-memory key to the format used in the index.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ *
+ * Unlike 'key_write()', only the two 32-bit key words are stored; nothing
+ * beyond them is touched.
+ */
+static inline void key_write_idx(const struct ubifs_info *c,
+				 const union ubifs_key *from, void *to)
+{
+	union ubifs_key *idx_key = to;
+	int i;
+
+	for (i = 0; i < 2; i++)
+		idx_key->j32[i] = cpu_to_le32(from->u32[i]);
+}
+
+/**
+ * key_copy - copy an in-memory key.
+ * @c: UBIFS file-system description object
+ * @from: the key to copy from
+ * @to: the key to copy to
+ *
+ * The key is copied as one 64-bit word.
+ */
+static inline void key_copy(const struct ubifs_info *c,
+			    const union ubifs_key *from, union ubifs_key *to)
+{
+	const uint64_t whole_key = from->u64[0];
+
+	to->u64[0] = whole_key;
+}
+
+/**
+ * keys_cmp - compare keys.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * The keys are compared word by word, the first 32-bit word being the most
+ * significant one. This function returns %-1 if @key1 is less than @key2, %0
+ * if the keys are equivalent and %1 if @key1 is greater than @key2.
+ */
+static inline int keys_cmp(const struct ubifs_info *c,
+			   const union ubifs_key *key1,
+			   const union ubifs_key *key2)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		/* The first differing word decides the order */
+		if (key1->u32[i] != key2->u32[i])
+			return key1->u32[i] < key2->u32[i] ? -1 : 1;
+	}
+
+	return 0;
+}
+
+/**
+ * is_hash_key - is a key vulnerable to hash collisions.
+ * @c: UBIFS file-system description object
+ * @key: key
+ *
+ * This function returns %1 if @key is a hashed key, i.e. a directory entry or
+ * an extended attribute entry key, and %0 otherwise.
+ */
+static inline int is_hash_key(const struct ubifs_info *c,
+			      const union ubifs_key *key)
+{
+	switch (key_type(c, key)) {
+	case UBIFS_DENT_KEY:
+	case UBIFS_XENT_KEY:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * key_max_inode_size - get maximum file size allowed by current key format.
+ * @c: UBIFS file-system description object
+ *
+ * For the simple key format this is the number of addressable blocks
+ * multiplied by %UBIFS_BLOCK_SIZE. Zero is returned for any other key format.
+ */
+static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
+{
+	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT)
+		return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
+
+	return 0;
+}
+#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
new file mode 100644
index 00000000000..36857b9ed59
--- /dev/null
+++ b/fs/ubifs/log.c
@@ -0,0 +1,805 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file is part of the UBIFS journal implementation and contains various
+ * functions which manipulate the log. The log is a fixed area on the flash
+ * which does not contain any data but refers to buds. The log is a part of the
+ * journal.
+ */
+
+#include "ubifs.h"
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_check_bud_bytes(struct ubifs_info *c); /* defined at the end of this file */
+#else
+#define dbg_check_bud_bytes(c) 0 /* debugging compiled out: the check always reports success */
+#endif
+
+/**
+ * ubifs_search_bud - look up a bud by LEB number.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to look for
+ *
+ * The buds RB-tree is walked with @c->buds_lock held. Returns the bud
+ * description object for LEB @lnum, or %NULL if the tree holds no bud with
+ * that LEB number.
+ */
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
+{
+	struct rb_node *node;
+	struct ubifs_bud *found = NULL;
+
+	spin_lock(&c->buds_lock);
+	for (node = c->buds.rb_node; node; ) {
+		struct ubifs_bud *cur = rb_entry(node, struct ubifs_bud, rb);
+
+		if (lnum == cur->lnum) {
+			found = cur;
+			break;
+		}
+		node = lnum < cur->lnum ? node->rb_left : node->rb_right;
+	}
+	spin_unlock(&c->buds_lock);
+
+	return found;
+}
+
+/**
+ * ubifs_get_wbuf - find the write-buffer that belongs to a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to look for
+ *
+ * If @lnum is a bud, the write-buffer of the journal head recorded in that bud
+ * is returned. %NULL is returned when there are no journal heads or when
+ * @lnum is not found in the buds tree.
+ */
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
+{
+	struct rb_node *node;
+	int jhead = 0, found = 0;
+
+	if (!c->jheads)
+		return NULL;
+
+	spin_lock(&c->buds_lock);
+	node = c->buds.rb_node;
+	while (node) {
+		struct ubifs_bud *cur = rb_entry(node, struct ubifs_bud, rb);
+
+		if (lnum == cur->lnum) {
+			/* Remember the head number; the lock is dropped below */
+			jhead = cur->jhead;
+			found = 1;
+			break;
+		}
+		if (lnum < cur->lnum)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	spin_unlock(&c->buds_lock);
+
+	if (!found)
+		return NULL;
+	return &c->jheads[jhead].wbuf;
+}
+
+/**
+ * next_log_lnum - get the log LEB which follows a given one.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * The log is circular: the LEB after @c->log_last is %UBIFS_LOG_LNUM.
+ * Returns the number of the next log LEB.
+ */
+static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+	const int next = lnum + 1;
+
+	return next > c->log_last ? UBIFS_LOG_LNUM : next;
+}
+
+/**
+ * empty_log_bytes - calculate amount of empty space in the log.
+ * @c: UBIFS file-system description object
+ *
+ * The log is a circular area. The head position is taken as
+ * @c->lhead_lnum:@c->lhead_offs and the tail position as the beginning of LEB
+ * @c->ltail_lnum. Returns the number of bytes between the head and the tail
+ * which may still be written.
+ */
+static inline long long empty_log_bytes(const struct ubifs_info *c)
+{
+	long long h, t;
+
+	h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
+	t = (long long)c->ltail_lnum * c->leb_size;
+
+	if (h > t)
+		return c->log_bytes - h + t;
+	if (h != t)
+		return t - h;
+
+	/*
+	 * The head and the tail coincide, which is ambiguous. If they are in
+	 * different LEBs, the head offset has reached the end of the LEB right
+	 * before the tail, so the log is completely full. Otherwise the head
+	 * sits at the very beginning of the tail LEB and the log is empty.
+	 */
+	if (c->lhead_lnum != c->ltail_lnum)
+		return 0;
+	return c->log_bytes;
+}
+
+/**
+ * ubifs_add_bud - insert a bud into the buds tree and its journal head list.
+ * @c: UBIFS file-system description object
+ * @bud: the bud to insert
+ *
+ * The bud is linked into the @c->buds RB-tree, which is ordered by LEB number,
+ * and, if journal heads exist, appended to the buds list of its journal head.
+ * The space of the bud is added to @c->bud_bytes. All of this is done with
+ * @c->buds_lock held.
+ */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+	struct rb_node **link, *parent = NULL;
+
+	spin_lock(&c->buds_lock);
+
+	/* Descend to the empty slot where the new bud belongs */
+	for (link = &c->buds.rb_node; *link; ) {
+		struct ubifs_bud *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ubifs_bud, rb);
+		ubifs_assert(bud->lnum != cur->lnum);
+		if (bud->lnum < cur->lnum)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+	rb_link_node(&bud->rb, parent, link);
+	rb_insert_color(&bud->rb, &c->buds);
+
+	if (!c->jheads) {
+		ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+	} else {
+		struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+
+		list_add_tail(&bud->list, &jh->buds_list);
+	}
+
+	/*
+	 * The bud is brand new and nothing has been written to it yet, but its
+	 * space is accounted right away: the point of the accounting is to
+	 * bound the mount time, and this bud is going to be read and scanned
+	 * in any case.
+	 */
+	c->bud_bytes += c->leb_size - bud->start;
+
+	dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
+		bud->start, bud->jhead, c->bud_bytes);
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_create_buds_lists - build the per-journal-head buds lists.
+ * @c: UBIFS file-system description object
+ *
+ * Used on remount read-write. Every bud in the buds tree is visited in tree
+ * order and appended to the buds list of the journal head it refers to.
+ */
+void ubifs_create_buds_lists(struct ubifs_info *c)
+{
+	struct rb_node *node;
+
+	spin_lock(&c->buds_lock);
+	for (node = rb_first(&c->buds); node; node = rb_next(node)) {
+		struct ubifs_bud *bud = rb_entry(node, struct ubifs_bud, rb);
+
+		list_add_tail(&bud->list, &c->jheads[bud->jhead].buds_list);
+	}
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_add_bud_to_log - add a new bud to the log.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head the bud belongs to
+ * @lnum: LEB number of the bud
+ * @offs: starting offset of the bud
+ *
+ * This function writes a reference node for the new bud LEB @lnum to the log,
+ * and adds the bud to the buds tree. It also makes sure that there is enough
+ * free space in the log and that the amount of bud space does not exceed the
+ * 'c->max_bud_bytes' limit. Returns zero in case of success, %-EAGAIN if
+ * commit is required, %-ENOMEM if memory allocation fails, %-EROFS if the
+ * media is read-only, and a negative error code in case of other failures.
+ */
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
+{
+ int err;
+ struct ubifs_bud *bud;
+ struct ubifs_ref_node *ref;
+
+ bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
+ if (!bud)
+ return -ENOMEM;
+ ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
+ if (!ref) {
+ kfree(bud);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&c->log_mutex);
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_unlock;
+ }
+
+ /* Make sure we have enough space in the log */
+ if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
+ dbg_log("not enough log space - %lld, required %d",
+ empty_log_bytes(c), c->min_log_bytes);
+ ubifs_commit_required(c);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ /*
+ * Make sure the amount of space in buds will not exceed the
+ * 'c->max_bud_bytes' limit, because we want to guarantee mount time
+ * limits.
+ *
+ * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
+ * because we are holding @c->log_mutex. All changes of @c->bud_bytes
+ * take place when both @c->log_mutex and @c->buds_lock are locked.
+ */
+ if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
+ dbg_log("bud bytes %lld (%lld max), require commit",
+ c->bud_bytes, c->max_bud_bytes);
+ ubifs_commit_required(c);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ /*
+ * If the journal is full enough - start background commit. Note, it is
+ * OK to read 'c->cmt_state' without spinlock because integer reads
+ * are atomic in the kernel.
+ */
+ if (c->bud_bytes >= c->bg_bud_bytes &&
+ c->cmt_state == COMMIT_RESTING) {
+ dbg_log("bud bytes %lld (%lld max), initiate BG commit",
+ c->bud_bytes, c->max_bud_bytes);
+ ubifs_request_bg_commit(c);
+ }
+
+ bud->lnum = lnum;
+ bud->start = offs;
+ bud->jhead = jhead;
+
+ ref->ch.node_type = UBIFS_REF_NODE;
+ ref->lnum = cpu_to_le32(bud->lnum);
+ ref->offs = cpu_to_le32(bud->start);
+ ref->jhead = cpu_to_le32(jhead);
+
+ if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { /* no room for one more reference node in the head LEB */
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ if (c->lhead_offs == 0) {
+ /* Must ensure next log LEB has been unmapped */
+ err = ubifs_leb_unmap(c, c->lhead_lnum);
+ if (err)
+ goto out_unlock;
+ }
+
+ if (bud->start == 0) {
+ /*
+ * Before writing the LEB reference which refers an empty LEB
+ * to the log, we have to make sure it is mapped, because
+ * otherwise we'd risk to refer an LEB with garbage in case of
+ * an unclean reboot, because the target LEB might have been
+ * unmapped, but not yet physically erased.
+ */
+ err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+ }
+
+ dbg_log("write ref LEB %d:%d",
+ c->lhead_lnum, c->lhead_offs);
+ err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
+ c->lhead_offs, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+
+ c->lhead_offs += c->ref_node_alsz;
+
+ ubifs_add_bud(c, bud); /* the bud now belongs to the buds tree, so it is not freed below */
+
+ mutex_unlock(&c->log_mutex);
+ kfree(ref);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&c->log_mutex);
+ kfree(ref);
+ kfree(bud);
+ return err;
+}
+
+/**
+ * remove_buds - remove used buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function removes used buds from the buds tree and moves them to the
+ * @c->old_buds list. It does not remove the buds which are pointed to by
+ * journal heads; for those it only advances the bud start to the current
+ * write-buffer offset. The amount of bud space which is covered by the commit
+ * is accumulated in @c->cmt_bud_bytes.
+ */
+static void remove_buds(struct ubifs_info *c)
+{
+ struct rb_node *p;
+
+ ubifs_assert(list_empty(&c->old_buds));
+ c->cmt_bud_bytes = 0;
+ spin_lock(&c->buds_lock);
+ p = rb_first(&c->buds);
+ while (p) {
+ struct rb_node *p1 = p;
+ struct ubifs_bud *bud;
+ struct ubifs_wbuf *wbuf;
+
+ p = rb_next(p); /* advance first, because 'p1' may be erased from the tree below */
+ bud = rb_entry(p1, struct ubifs_bud, rb);
+ wbuf = &c->jheads[bud->jhead].wbuf;
+
+ if (wbuf->lnum == bud->lnum) {
+ /*
+ * Do not remove buds which are pointed to by journal
+ * heads (non-closed buds).
+ */
+ c->cmt_bud_bytes += wbuf->offs - bud->start;
+ dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+ "cmt_bud_bytes %lld", bud->lnum, bud->start,
+ bud->jhead, wbuf->offs - bud->start,
+ c->cmt_bud_bytes);
+ bud->start = wbuf->offs;
+ } else {
+ c->cmt_bud_bytes += c->leb_size - bud->start;
+ dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+ "cmt_bud_bytes %lld", bud->lnum, bud->start,
+ bud->jhead, c->leb_size - bud->start,
+ c->cmt_bud_bytes);
+ rb_erase(p1, &c->buds);
+ list_del(&bud->list);
+ /*
+ * If the commit does not finish, the recovery will need
+ * to replay the journal, in which case the old buds
+ * must be unchanged. Do not release them until post
+ * commit i.e. do not allow them to be garbage
+ * collected.
+ */
+ list_add(&bud->list, &c->old_buds);
+ }
+ }
+ spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_log_start_commit - start commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: return new log tail LEB number
+ *
+ * The commit operation starts with writing "commit start" node to the log and
+ * reference nodes for all journal heads which will define new journal after
+ * the commit has been finished. The commit start and reference nodes are
+ * written in one go to the nearest empty log LEB (hence, when commit is
+ * finished UBIFS may safely unmap all the previous log LEBs). After the write
+ * the used buds are removed from the buds tree and @c->min_log_bytes is reset
+ * to zero. This function returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
+{
+ void *buf;
+ struct ubifs_cs_node *cs;
+ struct ubifs_ref_node *ref;
+ int err, i, max_len, len;
+
+ err = dbg_check_bud_bytes(c);
+ if (err)
+ return err;
+
+ max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ; /* worst case: one reference node per journal head */
+ max_len = ALIGN(max_len, c->min_io_size);
+ buf = cs = kmalloc(max_len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ cs->ch.node_type = UBIFS_CS_NODE;
+ cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
+ ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
+
+ /*
+ * Note, we do not lock 'c->log_mutex' because this is the commit start
+ * phase and we are exclusively using the log. And we do not lock
+ * write-buffer because nobody can write to the file-system at this
+ * phase.
+ */
+
+ len = UBIFS_CS_NODE_SZ;
+ for (i = 0; i < c->jhead_cnt; i++) {
+ int lnum = c->jheads[i].wbuf.lnum;
+ int offs = c->jheads[i].wbuf.offs;
+
+ if (lnum == -1 || offs == c->leb_size) /* this head has no LEB, or its LEB is completely filled */
+ continue;
+
+ dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+ ref = buf + len;
+ ref->ch.node_type = UBIFS_REF_NODE;
+ ref->lnum = cpu_to_le32(lnum);
+ ref->offs = cpu_to_le32(offs);
+ ref->jhead = cpu_to_le32(i);
+
+ ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
+ len += UBIFS_REF_NODE_SZ;
+ }
+
+ ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
+
+ /* Switch to the next log LEB */
+ if (c->lhead_offs) {
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ if (c->lhead_offs == 0) {
+ /*
+ * Must ensure next LEB has been unmapped. Note, 'c->lhead_offs' is
+ * always zero at this point, because the branch above has just
+ * reset it if it was not.
+ */
+ err = ubifs_leb_unmap(c, c->lhead_lnum);
+ if (err)
+ goto out;
+ }
+
+ len = ALIGN(len, c->min_io_size);
+ dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
+ err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
+ if (err)
+ goto out;
+
+ *ltail_lnum = c->lhead_lnum;
+
+ c->lhead_offs += len;
+ if (c->lhead_offs == c->leb_size) {
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ remove_buds(c);
+
+ /*
+ * We have started the commit and now users may use the rest of the log
+ * for new writes.
+ */
+ c->min_log_bytes = 0;
+
+out:
+ kfree(buf);
+ return err;
+}
+
+/**
+ * ubifs_log_end_commit - end commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: new log tail LEB number
+ *
+ * This function is called when the commit operation has finished. It moves
+ * the log tail to the new position, restores the requirement that one LEB
+ * worth of log space stays reserved, and subtracts the committed bud space
+ * from @c->bud_bytes. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
+{
+ int err;
+
+ /*
+ * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
+ * writes during commit. It is only the short "commit start" phase
+ * during which writers are blocked.
+ */
+ mutex_lock(&c->log_mutex);
+
+ dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
+ c->ltail_lnum, ltail_lnum);
+
+ c->ltail_lnum = ltail_lnum;
+ /*
+ * The commit is finished and from now on it must be guaranteed that
+ * there is always enough space for the next commit.
+ */
+ c->min_log_bytes = c->leb_size;
+
+ spin_lock(&c->buds_lock);
+ c->bud_bytes -= c->cmt_bud_bytes; /* 'c->cmt_bud_bytes' was calculated by 'remove_buds()' */
+ spin_unlock(&c->buds_lock);
+
+ err = dbg_check_bud_bytes(c);
+
+ mutex_unlock(&c->log_mutex);
+ return err;
+}
+
+/**
+ * ubifs_log_post_commit - clean up after the commit has completed.
+ * @c: UBIFS file-system description object
+ * @old_ltail_lnum: log tail LEB number before the commit
+ *
+ * Old buds are returned and freed only now, because they have to stay
+ * unchanged for as long as recovery might still need them.
+ *
+ * For the same reason the log LEBs between the old and the new log tail are
+ * unmapped only now.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
+{
+	int lnum, err = 0;
+
+	/* Release the buds which were retired when the commit started */
+	for (;;) {
+		struct ubifs_bud *bud;
+
+		if (list_empty(&c->old_buds))
+			break;
+
+		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		err = ubifs_return_leb(c, bud->lnum);
+		if (err)
+			return err;
+		list_del(&bud->list);
+		kfree(bud);
+	}
+
+	/* Unmap the log LEBs from the old tail up to the new tail */
+	mutex_lock(&c->log_mutex);
+	lnum = old_ltail_lnum;
+	while (lnum != c->ltail_lnum) {
+		dbg_log("unmap log LEB %d", lnum);
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			break;
+		lnum = next_log_lnum(c, lnum);
+	}
+	mutex_unlock(&c->log_mutex);
+
+	return err;
+}
+
+/**
+ * struct done_ref - references that have been done.
+ * @rb: rb-tree node which links this entry into the done tree
+ * @lnum: LEB number of the reference (the done tree is ordered by this value)
+ */
+struct done_ref {
+ struct rb_node rb;
+ int lnum;
+};
+
+/**
+ * done_already - check whether a reference was seen before, recording it if not.
+ * @done_tree: rb-tree which stores the references that have been done
+ * @lnum: LEB number of the reference
+ *
+ * If @lnum is not in @done_tree yet, a new entry is allocated and inserted.
+ * This function returns %1 if the reference has been done, %0 if it has not
+ * (and has just been recorded), and %-ENOMEM if the new entry could not be
+ * allocated.
+ */
+static int done_already(struct rb_root *done_tree, int lnum)
+{
+	struct rb_node **link = &done_tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct done_ref *dr;
+
+	while (*link) {
+		parent = *link;
+		dr = rb_entry(parent, struct done_ref, rb);
+		if (lnum == dr->lnum)
+			return 1;
+		if (lnum < dr->lnum)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	/* Not found: remember this LEB at the position the search ended */
+	dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
+	if (!dr)
+		return -ENOMEM;
+	dr->lnum = lnum;
+	rb_link_node(&dr->rb, parent, link);
+	rb_insert_color(&dr->rb, done_tree);
+
+	return 0;
+}
+
+/**
+ * destroy_done_tree - destroy the done tree.
+ * @done_tree: done tree to destroy
+ *
+ * The tree is walked down to a node which has no children, that node is
+ * detached from its parent and freed, and the walk continues from the parent.
+ * No re-balancing is done. Note, @done_tree->rb_node itself is not reset by
+ * this function.
+ */
+static void destroy_done_tree(struct rb_root *done_tree)
+{
+ struct rb_node *this = done_tree->rb_node;
+ struct done_ref *dr;
+
+ while (this) {
+ if (this->rb_left) { /* descend until a node without children is found */
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ dr = rb_entry(this, struct done_ref, rb);
+ this = rb_parent(this);
+ if (this) { /* clear the parent's link to the node which is about to be freed */
+ if (this->rb_left == &dr->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(dr);
+ }
+}
+
+/**
+ * add_node - append a node to the consolidated log.
+ * @c: UBIFS file-system description object
+ * @buf: buffer in which the current log LEB contents are being assembled
+ * @lnum: LEB number to write to is passed and returned here
+ * @offs: offset to write at is passed and returned here
+ * @node: node to append
+ *
+ * If @node does not fit into what remains of the LEB, the buffer is padded and
+ * written out to LEB @lnum, and assembling restarts at the beginning of the
+ * next log LEB. The node is then copied into the buffer and @offs is advanced
+ * by the node length aligned to 8 bytes.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
+		    void *node)
+{
+	const struct ubifs_ch *ch = node;
+	const int len = le32_to_cpu(ch->len);
+
+	if (len > c->leb_size - *offs) {
+		const int sz = ALIGN(*offs, c->min_io_size);
+		int err;
+
+		/* Flush the full LEB and move on to the next one */
+		ubifs_pad(c, buf + *offs, sz - *offs);
+		err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
+		if (err)
+			return err;
+		*lnum = next_log_lnum(c, *lnum);
+		*offs = 0;
+	}
+
+	memcpy(buf + *offs, node, len);
+	*offs += ALIGN(len, 8);
+
+	return 0;
+}
+
+/**
+ * ubifs_consolidate_log - consolidate the log.
+ * @c: UBIFS file-system description object
+ *
+ * Repeated failed commits could cause the log to be full, but at least 1 LEB is
+ * needed for commit. This function rewrites the reference nodes in the log
+ * omitting duplicates, and failed CS nodes, and leaving no gaps. The log is
+ * scanned from the tail LEB to the head LEB, only the first commit start node
+ * and the first reference to each LEB are kept, the remaining log LEBs up to
+ * the old head are unmapped, and the log head is moved to the end of the
+ * rewritten data.
+ *
+ * This function returns %0 on success and a negative error code on failure
+ * (%-EINVAL if the consolidated log still ends in the old head LEB).
+ */
+int ubifs_consolidate_log(struct ubifs_info *c)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct rb_root done_tree = RB_ROOT;
+ int lnum, err, first = 1, write_lnum, offs = 0;
+ void *buf;
+
+ dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
+ c->lhead_lnum);
+ buf = vmalloc(c->leb_size);
+ if (!buf)
+ return -ENOMEM;
+ lnum = c->ltail_lnum;
+ write_lnum = lnum;
+ while (1) {
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb)) {
+ err = PTR_ERR(sleb);
+ goto out_free;
+ }
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ switch (snod->type) {
+ case UBIFS_REF_NODE: {
+ struct ubifs_ref_node *ref = snod->node;
+ int ref_lnum = le32_to_cpu(ref->lnum);
+
+ err = done_already(&done_tree, ref_lnum);
+ if (err < 0)
+ goto out_scan;
+ if (err != 1) { /* first reference to this LEB - keep it */
+ err = add_node(c, buf, &write_lnum,
+ &offs, snod->node);
+ if (err)
+ goto out_scan;
+ }
+ break;
+ }
+ case UBIFS_CS_NODE:
+ if (!first) /* only the first commit start node is kept */
+ break;
+ err = add_node(c, buf, &write_lnum, &offs,
+ snod->node);
+ if (err)
+ goto out_scan;
+ first = 0;
+ break;
+ }
+ }
+ ubifs_scan_destroy(sleb);
+ if (lnum == c->lhead_lnum)
+ break;
+ lnum = next_log_lnum(c, lnum);
+ }
+ if (offs) { /* write out what is left in the buffer */
+ int sz = ALIGN(offs, c->min_io_size);
+
+ ubifs_pad(c, buf + offs, sz - offs);
+ err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
+ if (err)
+ goto out_free;
+ offs = ALIGN(offs, c->min_io_size);
+ }
+ destroy_done_tree(&done_tree);
+ vfree(buf);
+ if (write_lnum == c->lhead_lnum) {
+ ubifs_err("log is too full");
+ return -EINVAL;
+ }
+ /* Unmap remaining LEBs */
+ lnum = write_lnum;
+ do {
+ lnum = next_log_lnum(c, lnum);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ } while (lnum != c->lhead_lnum);
+ c->lhead_lnum = write_lnum;
+ c->lhead_offs = offs;
+ dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
+ return 0;
+
+out_scan:
+ ubifs_scan_destroy(sleb);
+out_free:
+ destroy_done_tree(&done_tree);
+ vfree(buf);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_bud_bytes - verify the bud bytes accounting.
+ * @c: UBIFS file-system description object
+ *
+ * This function re-calculates the amount of flash space occupied by the buds
+ * on the buds lists of all journal heads and compares it with 'c->bud_bytes'.
+ * Nothing is checked unless the %UBIFS_CHK_GEN flag is set. Returns zero in
+ * case of success and %-EINVAL in case of a mismatch.
+ */
+static int dbg_check_bud_bytes(struct ubifs_info *c)
+{
+	struct ubifs_bud *bud;
+	long long total = 0;
+	int jh, err = 0;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	spin_lock(&c->buds_lock);
+	for (jh = 0; jh < c->jhead_cnt; jh++) {
+		list_for_each_entry(bud, &c->jheads[jh].buds_list, list)
+			total += c->leb_size - bud->start;
+	}
+
+	if (total != c->bud_bytes) {
+		ubifs_err("bad bud_bytes %lld, calculated %lld",
+			  c->bud_bytes, total);
+		err = -EINVAL;
+	}
+	spin_unlock(&c->buds_lock);
+
+	return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
new file mode 100644
index 00000000000..2ba93da71b6
--- /dev/null
+++ b/fs/ubifs/lprops.c
@@ -0,0 +1,1357 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the functions that access LEB properties and their
+ * categories. LEBs are categorized based on the needs of UBIFS, and the
+ * categories are stored as either heaps or lists to provide a fast way of
+ * finding a LEB in a particular category. For example, UBIFS may need to find
+ * an empty LEB for the journal, or a very dirty LEB for garbage collection.
+ */
+
+#include "ubifs.h"
+
+/**
+ * get_heap_comp_val - get the value by which a category heap is ordered.
+ * @lprops: LEB properties
+ * @cat: LEB category
+ *
+ * Returns the free space for %LPROPS_FREE, the sum of the free and dirty
+ * space for %LPROPS_DIRTY_IDX, and the dirty space for any other category.
+ */
+static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
+{
+	if (cat == LPROPS_FREE)
+		return lprops->free;
+	if (cat == LPROPS_DIRTY_IDX)
+		return lprops->free + lprops->dirty;
+
+	return lprops->dirty;
+}
+
+/**
+ * move_up_lpt_heap - sift a heap entry up towards the top.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @cat: LEB category
+ *
+ * Starting at @lprops->hpos, the entry is swapped with its parent for as long
+ * as its comparison value is strictly greater than that of the parent. The
+ * comparison value is the amount of free and/or dirty space, depending on the
+ * category (see 'get_heap_comp_val()').
+ */
+static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+			     struct ubifs_lprops *lprops, int cat)
+{
+	const int val = get_heap_comp_val(lprops, cat);
+	int pos = lprops->hpos;
+
+	/* Position zero is the top of the heap, nothing to do there */
+	while (pos) {
+		const int parent = (pos - 1) / 2;
+		struct ubifs_lprops *plp = heap->arr[parent];
+
+		if (get_heap_comp_val(plp, cat) >= val)
+			break;
+
+		/* Greater than the parent - swap the two entries */
+		plp->hpos = pos;
+		heap->arr[pos] = plp;
+		heap->arr[parent] = lprops;
+		lprops->hpos = parent;
+		pos = parent;
+	}
+}
+
+/**
+ * adjust_lpt_heap - move a changed heap entry up or down the heap.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @hpos: heap position of @lprops
+ * @cat: LEB category
+ *
+ * Changed entries in a heap are moved up or down until the parent's value is
+ * not less and no child's value is greater. In the case of LPT's category
+ * heaps, the value is either the amount of free space or the amount of dirty
+ * space, depending on the category. If the entry is greater than its parent
+ * it is only moved up; otherwise it is moved down while one of its children
+ * is greater. @lprops->hpos is kept up to date with every move.
+ */
+static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+ struct ubifs_lprops *lprops, int hpos, int cat)
+{
+ int val1, val2, val3, cpos;
+
+ val1 = get_heap_comp_val(lprops, cat);
+ /* Compare to parent and, if greater than parent, move up the heap */
+ if (hpos) {
+ int ppos = (hpos - 1) / 2;
+
+ val2 = get_heap_comp_val(heap->arr[ppos], cat);
+ if (val1 > val2) {
+ /* Greater than parent so move up */
+ while (1) {
+ heap->arr[ppos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[ppos];
+ heap->arr[ppos] = lprops;
+ lprops->hpos = ppos;
+ hpos = ppos;
+ if (!hpos)
+ return;
+ ppos = (hpos - 1) / 2;
+ val2 = get_heap_comp_val(heap->arr[ppos], cat);
+ if (val1 <= val2)
+ return;
+ /* Still greater than parent so keep going */
+ }
+ }
+ }
+ /* Not greater than parent, so compare to children */
+ while (1) {
+ /* Compare to left child */
+ cpos = hpos * 2 + 1;
+ if (cpos >= heap->cnt) /* no children at all */
+ return;
+ val2 = get_heap_comp_val(heap->arr[cpos], cat);
+ if (val1 < val2) {
+ /* Less than left child, so promote biggest child */
+ if (cpos + 1 < heap->cnt) {
+ val3 = get_heap_comp_val(heap->arr[cpos + 1],
+ cat);
+ if (val3 > val2)
+ cpos += 1; /* Right child is bigger */
+ }
+ heap->arr[cpos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[cpos];
+ heap->arr[cpos] = lprops;
+ lprops->hpos = cpos;
+ hpos = cpos;
+ continue;
+ }
+ /* Compare to right child */
+ cpos += 1;
+ if (cpos >= heap->cnt) /* there is no right child */
+ return;
+ val3 = get_heap_comp_val(heap->arr[cpos], cat);
+ if (val1 < val3) {
+ /* Less than right child, so promote right child */
+ heap->arr[cpos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[cpos];
+ heap->arr[cpos] = lprops;
+ lprops->hpos = cpos;
+ hpos = cpos;
+ continue;
+ }
+ return;
+ }
+}
+
+/**
+ * add_to_lpt_heap - put LEB properties onto a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category
+ *
+ * If the heap has room, @lprops is appended and sifted up. If the heap is
+ * full, @lprops is compared with one entry picked from the bottom part of the
+ * heap: when @lprops has the greater value it takes that entry's place and
+ * the displaced entry becomes uncategorized, otherwise @lprops is rejected.
+ *
+ * This function returns %1 if @lprops is added to the heap for LEB category
+ * @cat, otherwise %0 is returned because the heap is full.
+ */
+static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops,
+			   int cat)
+{
+	struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+	const int b = LPT_HEAP_SZ / 2 - 1;
+	struct ubifs_lprops *victim;
+	int cpos;
+
+	if (heap->cnt < heap->max_cnt) {
+		/* There is room - append and restore the heap order */
+		lprops->hpos = heap->cnt++;
+		heap->arr[lprops->hpos] = lprops;
+		move_up_lpt_heap(c, heap, lprops, cat);
+		dbg_check_heap(c, heap, cat, lprops->hpos);
+		return 1; /* Added to heap */
+	}
+
+	/*
+	 * The heap is full. Compare to some other LEB on the bottom of the
+	 * heap, picking the position kind of randomly from the address of
+	 * @lprops.
+	 */
+	cpos = (((size_t)lprops >> 4) & b) + b;
+	ubifs_assert(cpos >= b);
+	ubifs_assert(cpos < LPT_HEAP_SZ);
+	ubifs_assert(cpos < heap->cnt);
+
+	victim = heap->arr[cpos];
+	if (get_heap_comp_val(lprops, cat) <= get_heap_comp_val(victim, cat)) {
+		dbg_check_heap(c, heap, cat, -1);
+		return 0; /* Not added to heap */
+	}
+
+	/* Evict the picked entry to the uncategorized list and replace it */
+	victim->flags &= ~LPROPS_CAT_MASK;
+	victim->flags |= LPROPS_UNCAT;
+	list_add(&victim->list, &c->uncat_list);
+	lprops->hpos = cpos;
+	heap->arr[cpos] = lprops;
+	move_up_lpt_heap(c, heap, lprops, cat);
+	dbg_check_heap(c, heap, cat, lprops->hpos);
+	return 1; /* Added to heap */
+}
+
+/**
+ * remove_from_lpt_heap - take LEB properties off a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category
+ *
+ * The last heap entry is moved into the slot of @lprops (unless @lprops is
+ * the last entry itself) and then sifted to its proper position.
+ */
+static void remove_from_lpt_heap(struct ubifs_info *c,
+				 struct ubifs_lprops *lprops, int cat)
+{
+	struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+	const int hpos = lprops->hpos;
+
+	ubifs_assert(hpos >= 0 && hpos < heap->cnt);
+	ubifs_assert(heap->arr[hpos] == lprops);
+
+	heap->cnt -= 1;
+	if (hpos < heap->cnt) {
+		struct ubifs_lprops *last = heap->arr[heap->cnt];
+
+		heap->arr[hpos] = last;
+		last->hpos = hpos;
+		adjust_lpt_heap(c, heap, last, hpos, cat);
+	}
+	dbg_check_heap(c, heap, cat, -1);
+}
+
+/**
+ * lpt_heap_replace - make a category heap refer to a copy of lprops.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties being replaced (not referenced here)
+ * @new_lprops: LEB properties which take their place
+ * @cat: LEB category
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * together with the lprops it contains. The category heap then has to point
+ * to the new lprops, so the heap slot at @new_lprops->hpos is overwritten
+ * with @new_lprops.
+ */
+static void lpt_heap_replace(struct ubifs_info *c,
+			     struct ubifs_lprops *old_lprops,
+			     struct ubifs_lprops *new_lprops, int cat)
+{
+	c->lpt_heap[cat - 1].arr[new_lprops->hpos] = new_lprops;
+}
+
+/**
+ * ubifs_add_to_cat - put LEB properties into a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category to add to
+ *
+ * LEB properties are categorized to enable fast find operations. Heap
+ * categories fall back to %LPROPS_UNCAT when the heap refuses the entry. The
+ * category which was finally used is stored in @lprops->flags.
+ */
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+		      int cat)
+{
+	struct list_head *head = NULL;
+
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		if (add_to_lpt_heap(c, lprops, cat))
+			break;
+		/* No more room on heap so make it uncategorized */
+		cat = LPROPS_UNCAT;
+		/* Fall through */
+	case LPROPS_UNCAT:
+		head = &c->uncat_list;
+		break;
+	case LPROPS_EMPTY:
+		head = &c->empty_list;
+		break;
+	case LPROPS_FREEABLE:
+		head = &c->freeable_list;
+		c->freeable_cnt += 1;
+		break;
+	case LPROPS_FRDI_IDX:
+		head = &c->frdi_idx_list;
+		break;
+	default:
+		ubifs_assert(0);
+	}
+
+	/* List-based categories are linked here, heap entries have no list */
+	if (head)
+		list_add(&lprops->list, head);
+
+	lprops->flags &= ~LPROPS_CAT_MASK;
+	lprops->flags |= cat;
+}
+
+/**
+ * ubifs_remove_from_cat - take LEB properties out of a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category to remove from
+ *
+ * LEB properties are categorized to enable fast find operations. Heap
+ * categories are removed from their heap, list categories are unlinked from
+ * their list, and the freeable counter is maintained for %LPROPS_FREEABLE.
+ */
+static void ubifs_remove_from_cat(struct ubifs_info *c,
+				  struct ubifs_lprops *lprops, int cat)
+{
+	switch (cat) {
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		remove_from_lpt_heap(c, lprops, cat);
+		return;
+	case LPROPS_FREEABLE:
+	case LPROPS_UNCAT:
+	case LPROPS_EMPTY:
+	case LPROPS_FRDI_IDX:
+		break;
+	default:
+		ubifs_assert(0);
+		return;
+	}
+
+	/* Only list-based categories get here */
+	if (cat == LPROPS_FREEABLE) {
+		c->freeable_cnt -= 1;
+		ubifs_assert(c->freeable_cnt >= 0);
+	}
+	ubifs_assert(!list_empty(&lprops->list));
+	list_del(&lprops->list);
+}
+
+/**
+ * ubifs_replace_cat - substitute lprops in a category list or heap.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties being replaced
+ * @new_lprops: LEB properties which take their place
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * together with the lprops it contains. References to the old lprops in the
+ * category lists and heaps then have to be switched to the new lprops. The
+ * category is taken from @new_lprops->flags.
+ */
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+		       struct ubifs_lprops *new_lprops)
+{
+	const int cat = new_lprops->flags & LPROPS_CAT_MASK;
+
+	switch (cat) {
+	case LPROPS_UNCAT:
+	case LPROPS_EMPTY:
+	case LPROPS_FREEABLE:
+	case LPROPS_FRDI_IDX:
+		/* List-based categories */
+		list_replace(&old_lprops->list, &new_lprops->list);
+		break;
+	case LPROPS_DIRTY:
+	case LPROPS_DIRTY_IDX:
+	case LPROPS_FREE:
+		/* Heap-based categories */
+		lpt_heap_replace(c, old_lprops, new_lprops, cat);
+		break;
+	default:
+		ubifs_assert(0);
+	}
+}
+
+/**
+ * ubifs_ensure_cat - re-categorize uncategorized LEB properties if possible.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * A LEB may have fallen off the bottom of a heap and ended up uncategorized
+ * even though it would qualify for a category now. If that is the case, this
+ * function tries to put it back. LEB properties which already have a category
+ * are left alone.
+ */
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	int new_cat;
+
+	if ((lprops->flags & LPROPS_CAT_MASK) != LPROPS_UNCAT)
+		return;
+
+	new_cat = ubifs_categorize_lprops(c, lprops);
+	if (new_cat != LPROPS_UNCAT) {
+		ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT);
+		ubifs_add_to_cat(c, lprops, new_cat);
+	}
+}
+
+/**
+ * ubifs_categorize_lprops - work out which category LEB properties belong to.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to categorize
+ *
+ * LEB properties are categorized to enable fast find operations. The checks
+ * are made in this order:
+ *   - a taken LEB is always %LPROPS_UNCAT;
+ *   - a completely free LEB is %LPROPS_EMPTY;
+ *   - a LEB with nothing but free and dirty space is %LPROPS_FRDI_IDX if it is
+ *     an index LEB and %LPROPS_FREEABLE otherwise;
+ *   - an index LEB with at least @c->min_idx_node_sz of free plus dirty space
+ *     is %LPROPS_DIRTY_IDX;
+ *   - a non-index LEB is %LPROPS_DIRTY if its dirty space is at least
+ *     @c->dead_wm and exceeds its free space, else %LPROPS_FREE if it has any
+ *     free space;
+ *   - everything else is %LPROPS_UNCAT.
+ *
+ * Note that the returned category is where the LEB belongs; if that category
+ * is stored as a heap and the heap is full, the LEB properties may still end
+ * up as %LPROPS_UNCAT.
+ */
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops)
+{
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPROPS_UNCAT;
+
+	if (lprops->free == c->leb_size) {
+		ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+		return LPROPS_EMPTY;
+	}
+
+	if (lprops->free + lprops->dirty == c->leb_size)
+		return (lprops->flags & LPROPS_INDEX) ? LPROPS_FRDI_IDX :
+							 LPROPS_FREEABLE;
+
+	if (lprops->flags & LPROPS_INDEX) {
+		if (lprops->dirty + lprops->free >= c->min_idx_node_sz)
+			return LPROPS_DIRTY_IDX;
+		return LPROPS_UNCAT;
+	}
+
+	if (lprops->dirty >= c->dead_wm && lprops->dirty > lprops->free)
+		return LPROPS_DIRTY;
+
+	return lprops->free > 0 ? LPROPS_FREE : LPROPS_UNCAT;
+}
+
+/**
+ * change_category - change LEB properties category.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to recategorize
+ *
+ * LEB properties are categorized to enable fast find operations. When the LEB
+ * properties change they must be recategorized. If the category is unchanged
+ * and it is a heap category, the entry is re-positioned within its heap; if
+ * the category is unchanged and it is a list category, nothing is done.
+ * Otherwise the LEB properties are moved from the old category to the new one.
+ */
+static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+	int old_cat = lprops->flags & LPROPS_CAT_MASK;
+	int new_cat = ubifs_categorize_lprops(c, lprops);
+
+	if (old_cat == new_cat) {
+		struct ubifs_lpt_heap *heap;
+
+		/* lprops on a heap now must be moved up or down */
+		if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
+			return; /* Not on a heap */
+		/*
+		 * Only form the heap pointer once the index is known to be in
+		 * range, so no out-of-bounds address is ever computed.
+		 */
+		heap = &c->lpt_heap[new_cat - 1];
+		adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
+	} else {
+		ubifs_remove_from_cat(c, lprops, old_cat);
+		ubifs_add_to_cat(c, lprops, new_cat);
+	}
+}
+
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops by taking @c->lp_mutex. Lprops have to be
+ * unlocked by 'ubifs_release_lprops()'.
+ */
+void ubifs_get_lprops(struct ubifs_info *c)
+{
+ mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * calc_dark - calculate LEB dark space size.
+ * @c: the UBIFS file-system description object
+ * @spc: amount of free and dirty space in the LEB
+ *
+ * This function calculates the amount of dark space in an LEB which has @spc
+ * bytes of free and dirty space, and returns the result. @spc is expected to
+ * be a multiple of 8.
+ *
+ * Dark space is the space which is not always usable - it depends on which
+ * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
+ * it is dark space, because it cannot fit a large data node. So UBIFS cannot
+ * count on this LEB and treat these 512 bytes as usable because it is not true
+ * if, for example, only big chunks of uncompressible data will be written to
+ * the FS.
+ */
+static int calc_dark(struct ubifs_info *c, int spc)
+{
+	int dark;
+
+	ubifs_assert(!(spc & 7));
+
+	if (spc < c->dark_wm)
+		/* Below the watermark everything is dark */
+		dark = spc;
+	else if (spc - c->dark_wm < MIN_WRITE_SZ)
+		/*
+		 * If we have slightly more space than the dark space
+		 * watermark, we can anyway safely assume that we'll be able to
+		 * write a node of the smallest size there.
+		 */
+		dark = spc - MIN_WRITE_SZ;
+	else
+		dark = c->dark_wm;
+
+	return dark;
+}
+
+/**
+ * is_lprops_dirty - determine if LEB properties are dirty.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to test
+ *
+ * The pnode that contains @lprops in its 'lprops[]' array is recovered from the
+ * @lprops pointer and its flags are tested. Returns non-zero if that pnode has
+ * %DIRTY_CNODE set and %COW_ZNODE clear, and zero otherwise.
+ */
+static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+ struct ubifs_pnode *pnode;
+ int pos;
+
+ /* Position of @lprops within the pnode's 'lprops[]' array */
+ pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
+ /* Step back to element 0 and convert that address to the pnode */
+ pnode = (struct ubifs_pnode *)container_of(lprops - pos,
+ struct ubifs_pnode,
+ lprops[0]);
+ return !test_bit(COW_ZNODE, &pnode->flags) &&
+ test_bit(DIRTY_CNODE, &pnode->flags);
+}
+
+/**
+ * ubifs_change_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to change
+ * @free: new free space amount
+ * @dirty: new dirty space amount
+ * @flags: new flags
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This function changes LEB properties. This function does not change a LEB
+ * property (@free, @dirty or @flags) if the value passed is %LPROPS_NC. New
+ * @free and @dirty values are aligned up to a multiple of 8, the statistics in
+ * @c->lst are adjusted to match, and the LEB is re-categorized. The caller is
+ * expected to hold @c->lp_mutex; @c->space_lock is taken by this function.
+ *
+ * This function returns a pointer to the updated LEB properties on success
+ * and an error pointer (to be tested with 'IS_ERR()') on failure. N.B. the LEB
+ * properties may have had to be copied (due to COW) and consequently the
+ * pointer returned may not be the same as the pointer passed.
+ */
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+ const struct ubifs_lprops *lp,
+ int free, int dirty, int flags,
+ int idx_gc_cnt)
+{
+ /*
+ * This is the only function that is allowed to change lprops, so we
+ * discard the const qualifier.
+ */
+ struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
+
+ dbg_lp("LEB %d, free %d, dirty %d, flags %d",
+ lprops->lnum, free, dirty, flags);
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+ ubifs_assert(c->lst.empty_lebs >= 0 &&
+ c->lst.empty_lebs <= c->main_lebs);
+ ubifs_assert(c->freeable_cnt >= 0);
+ ubifs_assert(c->freeable_cnt <= c->main_lebs);
+ ubifs_assert(c->lst.taken_empty_lebs >= 0);
+ ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
+ ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
+ ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
+ ubifs_assert(!(c->lst.total_used & 7));
+ ubifs_assert(free == LPROPS_NC || free >= 0);
+ ubifs_assert(dirty == LPROPS_NC || dirty >= 0);
+
+ /*
+ * If the lprops are not already dirty, look them up again with
+ * 'ubifs_lpt_lookup_dirty()'; the pointer may change as a result.
+ */
+ if (!is_lprops_dirty(c, lprops)) {
+ lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ } else
+ ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));
+
+ ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
+
+ spin_lock(&c->space_lock);
+
+ /*
+ * Take this LEB's current contribution out of the statistics; it is
+ * added back below once the new values are in place.
+ */
+ if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+ c->lst.taken_empty_lebs -= 1;
+
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ int old_spc;
+
+ old_spc = lprops->free + lprops->dirty;
+ if (old_spc < c->dead_wm)
+ c->lst.total_dead -= old_spc;
+ else
+ c->lst.total_dark -= calc_dark(c, old_spc);
+
+ c->lst.total_used -= c->leb_size - old_spc;
+ }
+
+ if (free != LPROPS_NC) {
+ free = ALIGN(free, 8);
+ c->lst.total_free += free - lprops->free;
+
+ /* Increase or decrease empty LEBs counter if needed */
+ if (free == c->leb_size) {
+ if (lprops->free != c->leb_size)
+ c->lst.empty_lebs += 1;
+ } else if (lprops->free == c->leb_size)
+ c->lst.empty_lebs -= 1;
+ lprops->free = free;
+ }
+
+ if (dirty != LPROPS_NC) {
+ dirty = ALIGN(dirty, 8);
+ c->lst.total_dirty += dirty - lprops->dirty;
+ lprops->dirty = dirty;
+ }
+
+ if (flags != LPROPS_NC) {
+ /* Take care about indexing LEBs counter if needed */
+ if ((lprops->flags & LPROPS_INDEX)) {
+ if (!(flags & LPROPS_INDEX))
+ c->lst.idx_lebs -= 1;
+ } else if (flags & LPROPS_INDEX)
+ c->lst.idx_lebs += 1;
+ lprops->flags = flags;
+ }
+
+ /* Add the contribution of the updated values back to the statistics */
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ int new_spc;
+
+ new_spc = lprops->free + lprops->dirty;
+ if (new_spc < c->dead_wm)
+ c->lst.total_dead += new_spc;
+ else
+ c->lst.total_dark += calc_dark(c, new_spc);
+
+ c->lst.total_used += c->leb_size - new_spc;
+ }
+
+ if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+ c->lst.taken_empty_lebs += 1;
+
+ /* The new values may put the LEB into a different category */
+ change_category(c, lprops);
+
+ c->idx_gc_cnt += idx_gc_cnt;
+
+ spin_unlock(&c->space_lock);
+
+ return lprops;
+}
+
+/**
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops. It asserts that @c->lp_mutex is held and that the empty LEBs
+ * counter is within range before releasing the mutex.
+ */
+void ubifs_release_lprops(struct ubifs_info *c)
+{
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+ ubifs_assert(c->lst.empty_lebs >= 0 &&
+ c->lst.empty_lebs <= c->main_lebs);
+
+ mutex_unlock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_get_lp_stats - get lprops statistics.
+ * @c: UBIFS file-system description object
+ * @st: return statistics
+ *
+ * Copies @c->lst into @st while holding @c->space_lock.
+ */
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+{
+ spin_lock(&c->space_lock);
+ memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_change_one_lp - change properties of a single LEB.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This is a helper wrapper over 'ubifs_change_lp()' which hides the lprops
+ * get/release and the look-up of LEB @lnum. The new flags are the current
+ * flags with @flags_set added and @flags_clean removed; @free, @dirty and
+ * @idx_gc_cnt are passed through unchanged. Returns zero in case of success
+ * and a negative error code in case of failure.
+ */
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean, int idx_gc_cnt)
+{
+	const struct ubifs_lprops *lp;
+	int new_flags, err = 0;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (!IS_ERR(lp)) {
+		new_flags = (lp->flags | flags_set) & ~flags_clean;
+		lp = ubifs_change_lp(c, lp, free, dirty, new_flags,
+				     idx_gc_cnt);
+	}
+	/* Covers a failed look-up as well as a failed change */
+	if (IS_ERR(lp))
+		err = PTR_ERR(lp);
+
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_update_one_lp - update properties of a single LEB.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ *
+ * This function is the same as 'ubifs_change_one_lp()' except that @dirty is
+ * added to the current dirty space instead of replacing it, and the idx_gc
+ * count is left unchanged. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+			int flags_set, int flags_clean)
+{
+	const struct ubifs_lprops *lp;
+	int new_flags, err = 0;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (!IS_ERR(lp)) {
+		new_flags = (lp->flags | flags_set) & ~flags_clean;
+		lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty,
+				     new_flags, 0);
+	}
+	/* Covers a failed look-up as well as a failed change */
+	if (IS_ERR(lp))
+		err = PTR_ERR(lp);
+
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_read_one_lp - read properties of a single LEB.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to read properties for
+ * @lp: where to store read properties
+ *
+ * This helper looks up the properties of LEB @lnum under the lprops lock and
+ * copies them into @lp. Returns zero in case of success and a negative error
+ * code in case of failure, in which case @lp is left untouched.
+ */
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
+{
+	const struct ubifs_lprops *found;
+	int err = 0;
+
+	ubifs_get_lprops(c);
+
+	found = ubifs_lpt_lookup(c, lnum);
+	if (IS_ERR(found))
+		err = PTR_ERR(found);
+	else
+		memcpy(lp, found, sizeof(struct ubifs_lprops));
+
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * ubifs_fast_find_free - try to find a LEB with free space quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns the LEB properties at the top of the %LPROPS_FREE
+ * heap, or %NULL if that heap is empty. The caller is expected to hold
+ * @c->lp_mutex.
+ */
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c)
+{
+	struct ubifs_lpt_heap *free_heap = &c->lpt_heap[LPROPS_FREE - 1];
+	struct ubifs_lprops *top;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (!free_heap->cnt)
+		return NULL;
+
+	top = free_heap->arr[0];
+	ubifs_assert(!(top->flags & LPROPS_TAKEN));
+	ubifs_assert(!(top->flags & LPROPS_INDEX));
+	return top;
+}
+
+/**
+ * ubifs_fast_find_empty - try to find an empty LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns the LEB properties at the head of the empty list, or
+ * %NULL if that list is empty. The caller is expected to hold @c->lp_mutex.
+ */
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c)
+{
+	struct list_head *head = &c->empty_list;
+	struct ubifs_lprops *first;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(head))
+		return NULL;
+
+	first = list_entry(head->next, struct ubifs_lprops, list);
+	ubifs_assert(!(first->flags & LPROPS_TAKEN));
+	ubifs_assert(!(first->flags & LPROPS_INDEX));
+	ubifs_assert(first->free == c->leb_size);
+	return first;
+}
+
+/**
+ * ubifs_fast_find_freeable - try to find a freeable LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns the LEB properties at the head of the freeable list,
+ * or %NULL if that list is empty. The caller is expected to hold
+ * @c->lp_mutex.
+ */
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c)
+{
+	struct list_head *head = &c->freeable_list;
+	struct ubifs_lprops *first;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(head))
+		return NULL;
+
+	first = list_entry(head->next, struct ubifs_lprops, list);
+	ubifs_assert(!(first->flags & LPROPS_TAKEN));
+	ubifs_assert(!(first->flags & LPROPS_INDEX));
+	ubifs_assert(first->free + first->dirty == c->leb_size);
+	ubifs_assert(c->freeable_cnt > 0);
+	return first;
+}
+
+/**
+ * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns the LEB properties at the head of the frdi_idx list,
+ * or %NULL if that list is empty. The caller is expected to hold
+ * @c->lp_mutex.
+ */
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
+{
+	struct list_head *head = &c->frdi_idx_list;
+	struct ubifs_lprops *first;
+
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+	if (list_empty(head))
+		return NULL;
+
+	first = list_entry(head->next, struct ubifs_lprops, list);
+	ubifs_assert(!(first->flags & LPROPS_TAKEN));
+	ubifs_assert((first->flags & LPROPS_INDEX));
+	ubifs_assert(first->free + first->dirty == c->leb_size);
+	return first;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_cats - check category heaps and lists.
+ * @c: UBIFS file-system description object
+ *
+ * The check is skipped unless %UBIFS_CHK_GEN or %UBIFS_CHK_LPROPS is set in
+ * 'ubifs_chk_flags'. It verifies that every entry on the empty, freeable and
+ * frdi_idx lists satisfies the condition for being there and is not taken,
+ * that the freeable and idx_gc list lengths match their counters, and that
+ * every heap slot is non-NULL, records its own position and is not taken.
+ *
+ * This function returns %0 on success and %-EINVAL on the first inconsistency
+ * found.
+ */
+int dbg_check_cats(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct list_head *pos;
+ int i, cat;
+
+ if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ return 0;
+
+ /* Empty list: every LEB must be completely free and not taken */
+ list_for_each_entry(lprops, &c->empty_list, list) {
+ if (lprops->free != c->leb_size) {
+ ubifs_err("non-empty LEB %d on empty list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on empty list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ }
+
+ /* Freeable list: free + dirty must fill the LEB; count the entries */
+ i = 0;
+ list_for_each_entry(lprops, &c->freeable_list, list) {
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ ubifs_err("non-freeable LEB %d on freeable list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on freeable list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ i += 1;
+ }
+ if (i != c->freeable_cnt) {
+ ubifs_err("freeable list count %d expected %d", i,
+ c->freeable_cnt);
+ return -EINVAL;
+ }
+
+ /* The idx_gc list length must match its counter */
+ i = 0;
+ list_for_each(pos, &c->idx_gc)
+ i += 1;
+ if (i != c->idx_gc_cnt) {
+ ubifs_err("idx_gc list count %d expected %d", i,
+ c->idx_gc_cnt);
+ return -EINVAL;
+ }
+
+ /* frdi_idx list: freeable, not taken, and flagged as index LEBs */
+ list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ ubifs_err("non-freeable LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ ubifs_err("non-index LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ }
+
+ /* Heaps: category N is stored in c->lpt_heap[N - 1] */
+ for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (!lprops) {
+ ubifs_err("null ptr in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ if (lprops->hpos != i) {
+ ubifs_err("bad ptr in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * dbg_check_heap - check the consistency of one LPT heap.
+ * @c: UBIFS file-system description object
+ * @heap: heap to check
+ * @cat: LEB category that entries of @heap are expected to have
+ * @add_pos: heap position that is exempt from the category check
+ *
+ * The check is skipped unless %UBIFS_CHK_GEN or %UBIFS_CHK_LPROPS is set in
+ * 'ubifs_chk_flags'. Nothing is returned; on the first failure a message with
+ * one of the following codes is printed, followed by a stack dump and a dump
+ * of the heap:
+ *   1 - entry has the wrong category
+ *   2 - entry's recorded heap position is not its actual position
+ *   3 - LPT look-up of the entry's LEB number failed
+ *   4 - LPT look-up returned different lprops than the heap holds
+ *   5 - the same lprops pointer appears twice in the heap
+ *   6 - two heap entries have the same LEB number
+ */
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+ int add_pos)
+{
+ int i = 0, j, err = 0;
+
+ if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ return;
+
+ for (i = 0; i < heap->cnt; i++) {
+ struct ubifs_lprops *lprops = heap->arr[i];
+ struct ubifs_lprops *lp;
+
+ if (i != add_pos)
+ if ((lprops->flags & LPROPS_CAT_MASK) != cat) {
+ err = 1;
+ goto out;
+ }
+ if (lprops->hpos != i) {
+ err = 2;
+ goto out;
+ }
+ lp = ubifs_lpt_lookup(c, lprops->lnum);
+ if (IS_ERR(lp)) {
+ err = 3;
+ goto out;
+ }
+ if (lprops != lp) {
+ dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
+ (size_t)lprops, (size_t)lp, lprops->lnum,
+ lp->lnum);
+ err = 4;
+ goto out;
+ }
+ /* Compare against every earlier entry to detect duplicates */
+ for (j = 0; j < i; j++) {
+ lp = heap->arr[j];
+ if (lp == lprops) {
+ err = 5;
+ goto out;
+ }
+ if (lp->lnum == lprops->lnum) {
+ err = 6;
+ goto out;
+ }
+ }
+ }
+out:
+ if (err) {
+ dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
+ dbg_dump_stack();
+ dbg_dump_heap(c, heap, cat);
+ }
+}
+
+/**
+ * struct scan_check_data - data provided to scan callback function.
+ * @lst: LEB properties statistics accumulated by the callback
+ * @err: error code set by the callback when it stops the scan
+ */
+struct scan_check_data {
+ struct ubifs_lp_stats lst;
+ int err;
+};
+
+/**
+ * scan_check_cb - scan callback.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This callback checks that @lp has the right category and is on the matching
+ * list or heap, then scans the LEB itself, recalculates its free and dirty
+ * space and compares the result with @lp. Statistics for LEBs that pass are
+ * accumulated in @data->lst.
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP). As written it only returns %LPT_SCAN_CONTINUE or, with
+ * @data->err set, %LPT_SCAN_STOP.
+ */
+static int scan_check_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lp, int in_tree,
+ struct scan_check_data *data)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_lp_stats *lst = &data->lst;
+ int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
+
+ /* A categorized LEB must be in the category its properties imply */
+ cat = lp->flags & LPROPS_CAT_MASK;
+ if (cat != LPROPS_UNCAT) {
+ cat = ubifs_categorize_lprops(c, lp);
+ if (cat != (lp->flags & LPROPS_CAT_MASK)) {
+ ubifs_err("bad LEB category %d expected %d",
+ (lp->flags & LPROPS_CAT_MASK), cat);
+ goto out;
+ }
+ }
+
+ /* Check lp is on its category list (if it has one) */
+ if (in_tree) {
+ struct list_head *list = NULL;
+
+ switch (cat) {
+ case LPROPS_EMPTY:
+ list = &c->empty_list;
+ break;
+ case LPROPS_FREEABLE:
+ list = &c->freeable_list;
+ break;
+ case LPROPS_FRDI_IDX:
+ list = &c->frdi_idx_list;
+ break;
+ case LPROPS_UNCAT:
+ list = &c->uncat_list;
+ break;
+ }
+ if (list) {
+ struct ubifs_lprops *lprops;
+ int found = 0;
+
+ list_for_each_entry(lprops, list, list) {
+ if (lprops == lp) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ ubifs_err("bad LPT list (category %d)", cat);
+ goto out;
+ }
+ }
+ }
+
+ /* Check lp is on its category heap (if it has one) */
+ if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+ /*
+ * NOTE(review): when lp->hpos is -1 the first test is skipped but the
+ * second one still indexes heap->arr[lp->hpos] - confirm whether hpos
+ * can be -1 for an in-tree LEB of a heap category.
+ */
+ if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
+ lp != heap->arr[lp->hpos]) {
+ ubifs_err("bad LPT heap (category %d)", cat);
+ goto out;
+ }
+ }
+
+ sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ if (IS_ERR(sleb)) {
+ /*
+ * After an unclean unmount, empty and freeable LEBs
+ * may contain garbage.
+ */
+ if (lp->free == c->leb_size) {
+ ubifs_err("scan errors were in empty LEB "
+ "- continuing checking");
+ lst->empty_lebs += 1;
+ lst->total_free += c->leb_size;
+ lst->total_dark += calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+
+ if (lp->free + lp->dirty == c->leb_size &&
+ !(lp->flags & LPROPS_INDEX)) {
+ ubifs_err("scan errors were in freeable LEB "
+ "- continuing checking");
+ lst->total_free += lp->free;
+ lst->total_dirty += lp->dirty;
+ lst->total_dark += calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+ data->err = PTR_ERR(sleb);
+ return LPT_SCAN_STOP;
+ }
+
+ /* -1 means "not yet known"; decided by the first node in the LEB */
+ is_idx = -1;
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ int found, level = 0;
+
+ cond_resched();
+
+ if (is_idx == -1)
+ is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
+
+ /*
+ * NOTE(review): this fires for a non-index node in a LEB whose first
+ * node was an index node; the message wording suggests the opposite
+ * case - confirm which is intended.
+ */
+ if (is_idx && snod->type != UBIFS_IDX_NODE) {
+ ubifs_err("indexing node in data LEB %d:%d",
+ lnum, snod->offs);
+ goto out_destroy;
+ }
+
+ if (snod->type == UBIFS_IDX_NODE) {
+ struct ubifs_idx_node *idx = snod->node;
+
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ level = le16_to_cpu(idx->level);
+ }
+
+ /* Only nodes that the TNC still refers to count as used space */
+ found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
+ snod->offs, is_idx);
+ if (found) {
+ if (found < 0)
+ goto out_destroy;
+ used += ALIGN(snod->len, 8);
+ }
+ }
+
+ /* Free space follows the scanned data; the rest that is unused is dirty */
+ free = c->leb_size - sleb->endpt;
+ dirty = sleb->endpt - used;
+
+ if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
+ dirty < 0) {
+ ubifs_err("bad calculated accounting for LEB %d: "
+ "free %d, dirty %d", lnum, free, dirty);
+ goto out_destroy;
+ }
+
+ if (lp->free + lp->dirty == c->leb_size &&
+ free + dirty == c->leb_size)
+ if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
+ (!is_idx && free == c->leb_size) ||
+ lp->free == c->leb_size) {
+ /*
+ * Empty or freeable LEBs could contain index
+ * nodes from an uncompleted commit due to an
+ * unclean unmount. Or they could be empty for
+ * the same reason. Or it may simply not have been
+ * unmapped.
+ */
+ free = lp->free;
+ dirty = lp->dirty;
+ is_idx = 0;
+ }
+
+ if (is_idx && lp->free + lp->dirty == free + dirty &&
+ lnum != c->ihead_lnum) {
+ /*
+ * After an unclean unmount, an index LEB could have a different
+ * amount of free space than the value recorded by lprops. That
+ * is because the in-the-gaps method may use free space or
+ * create free space (as a side-effect of using ubi_leb_change
+ * and not writing the whole LEB). The incorrect free space
+ * value is not a problem because the index is only ever
+ * allocated empty LEBs, so there will never be an attempt to
+ * write to the free space at the end of an index LEB - except
+ * by the in-the-gaps method for which it is not a problem.
+ */
+ free = lp->free;
+ dirty = lp->dirty;
+ }
+
+ if (lp->free != free || lp->dirty != dirty)
+ goto out_print;
+
+ if (is_idx && !(lp->flags & LPROPS_INDEX)) {
+ if (free == c->leb_size)
+ /* Free but not unmapped LEB, it's fine */
+ is_idx = 0;
+ else {
+ ubifs_err("indexing node without indexing "
+ "flag");
+ goto out_print;
+ }
+ }
+
+ if (!is_idx && (lp->flags & LPROPS_INDEX)) {
+ ubifs_err("data node with indexing flag");
+ goto out_print;
+ }
+
+ /* The LEB is consistent: add it to the accumulated statistics */
+ if (free == c->leb_size)
+ lst->empty_lebs += 1;
+
+ if (is_idx)
+ lst->idx_lebs += 1;
+
+ if (!(lp->flags & LPROPS_INDEX))
+ lst->total_used += c->leb_size - free - dirty;
+ lst->total_free += free;
+ lst->total_dirty += dirty;
+
+ if (!(lp->flags & LPROPS_INDEX)) {
+ int spc = free + dirty;
+
+ if (spc < c->dead_wm)
+ lst->total_dead += spc;
+ else
+ lst->total_dark += calc_dark(c, spc);
+ }
+
+ ubifs_scan_destroy(sleb);
+
+ return LPT_SCAN_CONTINUE;
+
+out_print:
+ ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
+ "should be free %d, dirty %d",
+ lnum, lp->free, lp->dirty, lp->flags, free, dirty);
+ dbg_dump_leb(c, lnum);
+out_destroy:
+ ubifs_scan_destroy(sleb);
+out:
+ data->err = -EINVAL;
+ return LPT_SCAN_STOP;
+}
+
+/**
+ * dbg_check_lprops - check all LEB properties.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks all LEB properties and makes sure they are all correct.
+ * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
+ * and other negative error codes in case of other errors. This function is
+ * called while the file system is locked (because of commit start), so no
+ * additional locking is required. Note that locking the LPT mutex would cause
+ * a circular lock dependency with the TNC mutex.
+ *
+ * The check is skipped (and zero is returned) unless %UBIFS_CHK_LPROPS is set
+ * in 'ubifs_chk_flags'.
+ */
+int dbg_check_lprops(struct ubifs_info *c)
+{
+ int i, err;
+ struct scan_check_data data;
+ struct ubifs_lp_stats *lst = &data.lst;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ /*
+ * As we are going to scan the media, the write buffers have to be
+ * synchronized.
+ */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ return err;
+ }
+
+ memset(lst, 0, sizeof(struct ubifs_lp_stats));
+
+ data.err = 0;
+ /*
+ * -ENOSPC from the scan is not treated as a failure here - presumably
+ * it only means that the callback never stopped the scan (TODO confirm
+ * against 'ubifs_lpt_scan_nolock()').
+ */
+ err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
+ (ubifs_lpt_scan_callback)scan_check_cb,
+ &data);
+ if (err && err != -ENOSPC)
+ goto out;
+ if (data.err) {
+ err = data.err;
+ goto out;
+ }
+
+ /* Compare the statistics recalculated by the scan with the stored ones */
+ if (lst->empty_lebs != c->lst.empty_lebs ||
+ lst->idx_lebs != c->lst.idx_lebs ||
+ lst->total_free != c->lst.total_free ||
+ lst->total_dirty != c->lst.total_dirty ||
+ lst->total_used != c->lst.total_used) {
+ ubifs_err("bad overall accounting");
+ ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
+ "total_free %lld, total_dirty %lld, total_used %lld",
+ lst->empty_lebs, lst->idx_lebs, lst->total_free,
+ lst->total_dirty, lst->total_used);
+ ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
+ "total_free %lld, total_dirty %lld, total_used %lld",
+ c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
+ c->lst.total_dirty, c->lst.total_used);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (lst->total_dead != c->lst.total_dead ||
+ lst->total_dark != c->lst.total_dark) {
+ ubifs_err("bad dead/dark space accounting");
+ ubifs_err("calculated: total_dead %lld, total_dark %lld",
+ lst->total_dead, lst->total_dark);
+ ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
+ c->lst.total_dead, c->lst.total_dark);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Finally check the category lists and heaps themselves */
+ err = dbg_check_cats(c);
+out:
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
new file mode 100644
index 00000000000..9ff2463177e
--- /dev/null
+++ b/fs/ubifs/lpt.c
@@ -0,0 +1,2243 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the LEB properties tree (LPT) area. The LPT area
+ * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
+ * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
+ * between the log and the orphan area.
+ *
+ * The LPT area is like a miniature self-contained file system. It is required
+ * that it never runs out of space, is fast to access and update, and scales
+ * logarithmically. The LEB properties tree is implemented as a wandering tree
+ * much like the TNC, and the LPT area has its own garbage collection.
+ *
+ * The LPT has two slightly different forms called the "small model" and the
+ * "big model". The small model is used when the entire LEB properties table
+ * can be written into a single eraseblock. In that case, garbage collection
+ * consists of just writing the whole table, which therefore makes all other
+ * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
+ * selected for garbage collection, which consists of marking the nodes in
+ * that LEB as dirty, and then only the dirty nodes are written out. Also, in
+ * the case of the big model, a table of LEB numbers is saved so that the entire
+ * LPT does not need to be scanned looking for empty eraseblocks when UBIFS is
+ * first mounted.
+ */
+
+#include <linux/crc16.h>
+#include "ubifs.h"
+
+/**
+ * do_calc_lpt_geom - calculate sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * From the geometry already stored in @c and the selected model (@c->big_lpt),
+ * this function derives the tree height, the pnode and nnode counts, the
+ * widths of the packed bit fields, the packed sizes of pnodes, nnodes, ltab
+ * and lsave, and finally @c->lpt_sz, which includes an allowance for wastage.
+ */
+static void do_calc_lpt_geom(struct ubifs_info *c)
+{
+	const int hdr_bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS;
+	int lvl, cnt, span, max_pnodes, num_bits, waste;
+	long long sz, waste_total;
+
+	/* The height is sized from main_lebs plus the room left to max_leb_cnt */
+	cnt = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
+	max_pnodes = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+	c->lpt_hght = 1;
+	for (span = UBIFS_LPT_FANOUT; span < max_pnodes;
+	     span <<= UBIFS_LPT_FANOUT_SHIFT)
+		c->lpt_hght += 1;
+
+	c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+
+	/* Sum the nnodes of every level, starting just above the pnodes */
+	cnt = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
+	c->nnode_cnt = cnt;
+	for (lvl = 1; lvl < c->lpt_hght; lvl++) {
+		cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+		c->nnode_cnt += cnt;
+	}
+
+	/* Widths of the individual packed bit fields */
+	c->space_bits = fls(c->leb_size) - 3;
+	c->lpt_lnum_bits = fls(c->lpt_lebs);
+	c->lpt_offs_bits = fls(c->leb_size - 1);
+	c->lpt_spc_bits = fls(c->leb_size);
+	c->pcnt_bits = fls(DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT) - 1);
+	c->lnum_bits = fls(c->max_leb_cnt - 1);
+
+	/* Only the big model stores a node number in pnodes and nnodes */
+	num_bits = c->big_lpt ? c->pcnt_bits : 0;
+
+	/* Packed node sizes, in bits rounded up to whole bytes */
+	c->pnode_sz = DIV_ROUND_UP(hdr_bits + num_bits +
+				   (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT,
+				   8);
+	c->nnode_sz = DIV_ROUND_UP(hdr_bits + num_bits +
+				   (c->lpt_lnum_bits + c->lpt_offs_bits) *
+				   UBIFS_LPT_FANOUT, 8);
+	c->ltab_sz = DIV_ROUND_UP(hdr_bits +
+				  c->lpt_lebs * c->lpt_spc_bits * 2, 8);
+	c->lsave_sz = DIV_ROUND_UP(hdr_bits + c->lnum_bits * c->lsave_cnt, 8);
+
+	/* Minimum LPT size: every pnode and nnode plus ltab and lsave */
+	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz +
+		    (long long)c->nnode_cnt * c->nnode_sz +
+		    c->ltab_sz + c->lsave_sz;
+
+	/*
+	 * Wastage: one largest-node allowance for the first LEB and one more
+	 * for every further LEB the running size spills into, plus the padding
+	 * needed to align what is left to the minimum I/O unit.
+	 */
+	waste = max_t(int, c->pnode_sz, c->nnode_sz);
+	waste_total = waste;
+	sz = c->lpt_sz + waste;
+	while (sz > c->leb_size) {
+		sz += waste;
+		sz -= c->leb_size;
+		waste_total += waste;
+	}
+	waste_total += ALIGN(sz, c->min_io_size) - sz;
+	c->lpt_sz += waste_total;
+}
+
+/**
+ * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * Recalculates the LPT geometry and then checks that @c->lpt_lebs can hold
+ * twice @c->lpt_sz and that the ltab node fits into a single LEB. On success
+ * @c->check_lpt_free is set to the value of @c->big_lpt.
+ *
+ * This function returns %0 on success and %-EINVAL if either check fails.
+ */
+int ubifs_calc_lpt_geom(struct ubifs_info *c)
+{
+	uint64_t lebs;
+
+	do_calc_lpt_geom(c);
+
+	/* At least twice the LPT size is required, rounded up to whole LEBs */
+	lebs = c->lpt_sz * 2 + c->leb_size - 1;
+	do_div(lebs, c->leb_size);
+	if ((int)lebs > c->lpt_lebs) {
+		ubifs_err("too few LPT LEBs");
+		return -EINVAL;
+	}
+
+	/* ltab is written as a single node, so it has to fit in one LEB */
+	if (c->ltab_sz > c->leb_size) {
+		ubifs_err("LPT ltab too big");
+		return -EINVAL;
+	}
+
+	c->check_lpt_free = c->big_lpt;
+
+	return 0;
+}
+
+/**
+ * calc_dflt_lpt_geom - calculate default LPT geometry.
+ * @c: the UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @big_lpt: whether the LPT area is "big" is returned here
+ *
+ * The size of the LPT area depends on parameters that themselves depend on the
+ * size of the LPT area. This function therefore recalculates the LPT area
+ * geometry repeatedly (at most 64 times) until the parameters and the
+ * resulting geometry are consistent.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
+			      int *big_lpt)
+{
+	int tries, needed;
+	uint64_t sz;
+
+	/* First guess: the smallest permitted number of LPT LEBs */
+	c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
+	c->main_lebs = *main_lebs - c->lpt_lebs;
+	if (c->main_lebs <= 0)
+		return -EINVAL;
+
+	/*
+	 * Try the small model first. It is kept only if the whole LPT is no
+	 * larger than one LEB; otherwise fall back to the big model.
+	 */
+	c->big_lpt = 0;
+	do_calc_lpt_geom(c);
+	if (c->lpt_sz > c->leb_size) {
+		c->big_lpt = 1;
+		do_calc_lpt_geom(c);
+	}
+
+	/* Grow the LPT area until it can hold four times the LPT size */
+	for (tries = 64; tries > 0; tries--) {
+		sz = c->lpt_sz * 4 + c->leb_size - 1;
+		do_div(sz, c->leb_size);
+		needed = sz;
+
+		if (needed <= c->lpt_lebs) {
+			/* Geometry is stable; ltab must still fit in a LEB */
+			if (c->ltab_sz > c->leb_size) {
+				ubifs_err("LPT ltab too big");
+				return -EINVAL;
+			}
+			*main_lebs = c->main_lebs;
+			*big_lpt = c->big_lpt;
+			return 0;
+		}
+
+		/* Take the extra LEBs from the main area and recalculate */
+		c->lpt_lebs = needed;
+		c->main_lebs = *main_lebs - c->lpt_lebs;
+		if (c->main_lebs <= 0)
+			return -EINVAL;
+		do_calc_lpt_geom(c);
+	}
+
+	/* No consistent geometry was found within the iteration limit */
+	return -EINVAL;
+}
+
+/**
+ * pack_bits - pack bit fields end-to-end.
+ * @addr: address at which to pack (passed and next address returned)
+ * @pos: bit position at which to pack (passed and next position returned)
+ * @val: value to pack
+ * @nrbits: number of bits of value to pack (1-32)
+ *
+ * The value is stored least-significant byte first, starting at bit @pos of
+ * the byte at @addr. When @pos is non-zero the first byte is OR-ed into what
+ * is already there; every following byte is overwritten. On return @addr and
+ * @pos describe the first free bit after the field.
+ */
+static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
+{
+	uint8_t *p = *addr;
+	int b = *pos;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	/*
+	 * Test the width first: shifting a 32-bit value by 32 is undefined, so
+	 * the shift must not be evaluated when nrbits is 32.
+	 */
+	ubifs_assert(nrbits == 32 || (val >> nrbits) == 0);
+	if (b) {
+		/* Finish the partially filled byte, then spill whole bytes */
+		*p |= ((uint8_t)val) << b;
+		nrbits += b;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= (8 - b));
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24) {
+					*++p = (uint8_t)(val >>= 8);
+					if (nrbits > 32)
+						*++p = (uint8_t)(val >>= 8);
+				}
+			}
+		}
+	} else {
+		/* Byte aligned: plain little-endian store of 1 to 4 bytes */
+		*p = (uint8_t)val;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= 8);
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24)
+					*++p = (uint8_t)(val >>= 8);
+			}
+		}
+	}
+	/* nrbits now counts from bit 0 of the first byte touched */
+	b = nrbits & 7;
+	if (b == 0)
+		p++;
+	*addr = p;
+	*pos = b;
+}
+
+/**
+ * ubifs_unpack_bits - unpack bit fields.
+ * @addr: address at which to unpack (passed and next address returned)
+ * @pos: bit position at which to unpack (passed and next position returned)
+ * @nrbits: number of bits of value to unpack (1-32)
+ *
+ * Only the bytes that actually hold part of the field are read, so unpacking
+ * the last field of a node never touches memory beyond that field. On return
+ * @addr and @pos describe the first bit after the field.
+ *
+ * This function returns the value unpacked.
+ */
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
+{
+	const int width = nrbits;
+	const int k = 32 - nrbits;
+	uint8_t *p = *addr;
+	int b = *pos;
+	uint32_t val = 0;
+	/* Bytes spanned by the field, counting a partial first byte */
+	const int bytes = (nrbits + b + 7) >> 3;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	if (b) {
+		/*
+		 * The first byte contributes its upper (8 - b) bits, the
+		 * following bytes are gathered here. With bytes == 1 the
+		 * whole field lives in the first byte and val stays zero.
+		 */
+		switch (bytes) {
+		case 2:
+			val = p[1];
+			break;
+		case 3:
+			val = p[1] | ((uint32_t)p[2] << 8);
+			break;
+		case 4:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+			      ((uint32_t)p[3] << 16);
+			break;
+		case 5:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+			      ((uint32_t)p[3] << 16) |
+			      ((uint32_t)p[4] << 24);
+			break;
+		}
+		val <<= (8 - b);
+		val |= *p >> b;
+		nrbits += b;
+	} else {
+		switch (bytes) {
+		case 1:
+			val = p[0];
+			break;
+		case 2:
+			val = p[0] | ((uint32_t)p[1] << 8);
+			break;
+		case 3:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+			      ((uint32_t)p[2] << 16);
+			break;
+		case 4:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+			      ((uint32_t)p[2] << 16) |
+			      ((uint32_t)p[3] << 24);
+			break;
+		}
+	}
+	/* Drop everything above the requested field width */
+	val <<= k;
+	val >>= k;
+	/* nrbits now counts from bit 0 of the first byte read */
+	b = nrbits & 7;
+	p += nrbits / 8;
+	*addr = p;
+	*pos = b;
+	/* Width is tested first so that a shift by 32 is never evaluated */
+	ubifs_assert(width == 32 || (val >> width) == 0);
+	return val;
+}
+
+/**
+ * ubifs_pack_pnode - pack all the bit fields of a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @pnode: pnode to pack
+ *
+ * Layout: CRC, node type, pnode number (big model only), then for each of the
+ * %UBIFS_LPT_FANOUT LEBs the free and dirty space (both divided by 8) and a
+ * one-bit index flag. The CRC covers everything that follows the CRC field.
+ */
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_pnode *pnode)
+{
+	uint8_t *payload = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *wp = payload;
+	int bit = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&wp, &bit, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&wp, &bit, pnode->num, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		const struct ubifs_lprops *lp = &pnode->lprops[idx];
+
+		pack_bits(&wp, &bit, lp->free >> 3, c->space_bits);
+		pack_bits(&wp, &bit, lp->dirty >> 3, c->space_bits);
+		pack_bits(&wp, &bit, (lp->flags & LPROPS_INDEX) ? 1 : 0, 1);
+	}
+
+	/* The CRC is computed last and written at the front of the node */
+	crc = crc16(-1, payload, c->pnode_sz - UBIFS_LPT_CRC_BYTES);
+	wp = buf;
+	bit = 0;
+	pack_bits(&wp, &bit, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_nnode - pack all the bit fields of a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @nnode: nnode to pack
+ *
+ * Layout: CRC, node type, nnode number (big model only), then for each branch
+ * its LEB number relative to @c->lpt_first and its offset. A branch whose LEB
+ * number is zero is stored as if it were at @c->lpt_last + 1. The CRC covers
+ * everything that follows the CRC field.
+ */
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_nnode *nnode)
+{
+	uint8_t *payload = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *wp = payload;
+	int bit = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&wp, &bit, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&wp, &bit, nnode->num, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		int lnum = nnode->nbranch[idx].lnum;
+
+		/* Zero cannot be made relative to lpt_first; use last + 1 */
+		if (!lnum)
+			lnum = c->lpt_last + 1;
+		pack_bits(&wp, &bit, lnum - c->lpt_first, c->lpt_lnum_bits);
+		pack_bits(&wp, &bit, nnode->nbranch[idx].offs,
+			  c->lpt_offs_bits);
+	}
+
+	/* The CRC is computed last and written at the front of the node */
+	crc = crc16(-1, payload, c->nnode_sz - UBIFS_LPT_CRC_BYTES);
+	wp = buf;
+	bit = 0;
+	pack_bits(&wp, &bit, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_ltab - pack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @ltab: LPT's own lprops table to pack
+ *
+ * Layout: CRC, node type, then the free and dirty space of each of the
+ * @c->lpt_lebs LPT LEBs. The CRC covers everything that follows the CRC field.
+ */
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+		     struct ubifs_lpt_lprops *ltab)
+{
+	uint8_t *payload = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *wp = payload;
+	int bit = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&wp, &bit, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		const struct ubifs_lpt_lprops *ent = &ltab[idx];
+
+		pack_bits(&wp, &bit, ent->free, c->lpt_spc_bits);
+		pack_bits(&wp, &bit, ent->dirty, c->lpt_spc_bits);
+	}
+
+	/* The CRC is computed last and written at the front of the node */
+	crc = crc16(-1, payload, c->ltab_sz - UBIFS_LPT_CRC_BYTES);
+	wp = buf;
+	bit = 0;
+	pack_bits(&wp, &bit, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_lsave - pack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @lsave: LPT's save table to pack
+ *
+ * Layout: CRC, node type, then @c->lsave_cnt LEB numbers of @c->lnum_bits bits
+ * each. The CRC covers everything that follows the CRC field.
+ */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
+{
+	uint8_t *payload = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *wp = payload;
+	int bit = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&wp, &bit, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
+
+	idx = 0;
+	while (idx < c->lsave_cnt) {
+		pack_bits(&wp, &bit, lsave[idx], c->lnum_bits);
+		idx++;
+	}
+
+	/* The CRC is computed last and written at the front of the node */
+	crc = crc16(-1, payload, c->lsave_sz - UBIFS_LPT_CRC_BYTES);
+	wp = buf;
+	bit = 0;
+	pack_bits(&wp, &bit, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to which to add dirty space
+ * @dirty: amount of dirty space to add
+ *
+ * Nothing is done when @dirty or @lnum is zero. Otherwise @lnum is expected to
+ * lie within the LPT area (@c->lpt_first to @c->lpt_last) and the dirty count
+ * of its entry in @c->ltab is increased by @dirty.
+ */
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+	if (!dirty || !lnum)
+		return;
+	/* Check the range before @lnum is used to index the table */
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	dbg_lp("LEB %d add %d to %d",
+	       lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
+	c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * set_ltab - set LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ *
+ * @lnum is expected to lie within the LPT area (@c->lpt_first to
+ * @c->lpt_last); its entry in @c->ltab is overwritten with @free and @dirty.
+ */
+static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	/* Check the range before @lnum is used to index the table */
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	dbg_lp("LEB %d free %d dirty %d to %d %d",
+	       lnum, c->ltab[lnum - c->lpt_first].free,
+	       c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+	c->ltab[lnum - c->lpt_first].free = free;
+	c->ltab[lnum - c->lpt_first].dirty = dirty;
+}
+
+/**
+ * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode for which to add dirt
+ *
+ * The size of one nnode is added as dirt to the LEB recorded for @nnode: the
+ * parent's branch for a non-root nnode, @c->lpt_lnum for the root. For the
+ * root, ltab is additionally marked dirty (once) and its size is added as dirt
+ * to the LEB that holds it.
+ */
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
+{
+	struct ubifs_nnode *parent = nnode->parent;
+
+	if (parent) {
+		ubifs_add_lpt_dirt(c, parent->nbranch[nnode->iip].lnum,
+				   c->nnode_sz);
+		return;
+	}
+
+	/* Root nnode: its position is kept in @c rather than in a parent */
+	ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
+	if (c->lpt_drty_flgs & LTAB_DIRTY)
+		return;
+	c->lpt_drty_flgs |= LTAB_DIRTY;
+	ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ *
+ * The size of one pnode is added as dirt to the LEB that the parent's branch
+ * records for @pnode. @pnode->parent is dereferenced unconditionally.
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	int lnum = pnode->parent->nbranch[pnode->iip].lnum;
+
+	ubifs_add_lpt_dirt(c, lnum, c->pnode_sz);
+}
+
+/**
+ * calc_nnode_num - calculate nnode number.
+ * @row: the row in the tree (root is zero)
+ * @col: the column in the row (leftmost is zero)
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * Starting from 1, the number is built by appending, once per row, the lowest
+ * %UBIFS_LPT_FANOUT_SHIFT bits still left in @col. This function returns the
+ * nnode number for the nnode at @row and @col.
+ */
+static int calc_nnode_num(int row, int col)
+{
+	int num = 1;
+
+	for (; row != 0; row--) {
+		num = (num << UBIFS_LPT_FANOUT_SHIFT) |
+		      (col & (UBIFS_LPT_FANOUT - 1));
+		col >>= UBIFS_LPT_FANOUT_SHIFT;
+	}
+	return num;
+}
+
+/**
+ * calc_nnode_num_from_parent - calculate nnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function returns %1 when @parent is %NULL (the root). Otherwise the
+ * number is derived from the parent's number: the bit at the position given by
+ * the parent's level is toggled and (%UBIFS_LPT_FANOUT + @iip) is OR-ed in at
+ * that same position.
+ */
+static int calc_nnode_num_from_parent(struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int shift;
+
+	if (!parent)
+		return 1;
+
+	shift = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
+	return (parent->num ^ (1 << shift)) |
+	       ((UBIFS_LPT_FANOUT + iip) << shift);
+}
+
+/**
+ * calc_pnode_num_from_parent - calculate pnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The pnode number is a number that uniquely identifies a pnode and can be used
+ * easily to traverse the tree from the root to that pnode.
+ *
+ * The lowest (@c->lpt_hght - 1) groups of %UBIFS_LPT_FANOUT_SHIFT bits of the
+ * parent's number are taken in reverse order and @iip is appended as the final
+ * group. This function returns the resulting pnode number.
+ */
+static int calc_pnode_num_from_parent(struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int levels = c->lpt_hght - 1;
+	int path = parent->num;
+	int num = 0;
+
+	while (levels-- > 0) {
+		num = (num << UBIFS_LPT_FANOUT_SHIFT) |
+		      (path & (UBIFS_LPT_FANOUT - 1));
+		path >>= UBIFS_LPT_FANOUT_SHIFT;
+	}
+	return (num << UBIFS_LPT_FANOUT_SHIFT) | iip;
+}
+
+/**
+ * ubifs_create_dflt_lpt - create default LPT.
+ * @c: UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @lpt_first: LEB number of first LPT LEB
+ * @lpt_lebs: number of LEBs for LPT is passed and returned here
+ * @big_lpt: use big LPT model is passed and returned here
+ *
+ * The default geometry is calculated first. Then all pnodes, all nnodes (one
+ * level at a time, ending with the root), the lsave table (big model only) and
+ * finally ltab are packed into a LEB-sized buffer, which is written out with
+ * 'ubi_leb_change()' each time the next node would not fit.
+ *
+ * The positions of the root nnode, lsave, ltab and the LPT head are recorded
+ * in @c. @c->ltab points at a temporary table only while this function runs
+ * and is set back to %NULL before returning.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+ int *lpt_lebs, int *big_lpt)
+{
+ int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
+ int blnum, boffs, bsz, bcnt;
+ struct ubifs_pnode *pnode = NULL;
+ struct ubifs_nnode *nnode = NULL;
+ void *buf = NULL, *p;
+ struct ubifs_lpt_lprops *ltab = NULL;
+ int *lsave = NULL;
+
+ err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
+ if (err)
+ return err;
+ *lpt_lebs = c->lpt_lebs;
+
+ /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
+ c->lpt_first = lpt_first;
+ /* Needed by 'set_ltab()' */
+ c->lpt_last = lpt_first + c->lpt_lebs - 1;
+ /* Needed by 'ubifs_pack_lsave()' */
+ c->main_first = c->leb_cnt - *main_lebs;
+
+ /* All five allocations are checked together just below */
+ lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
+ pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
+ nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
+ buf = vmalloc(c->leb_size);
+ ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ if (!pnode || !nnode || !buf || !ltab || !lsave) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ubifs_assert(!c->ltab);
+ c->ltab = ltab; /* Needed by set_ltab */
+
+ /* Initialize LPT's own lprops */
+ for (i = 0; i < c->lpt_lebs; i++) {
+ ltab[i].free = c->leb_size;
+ ltab[i].dirty = 0;
+ ltab[i].tgc = 0;
+ ltab[i].cmt = 0;
+ }
+
+ /* 'lnum' is the LEB being filled, 'p' the write position in 'buf' */
+ lnum = lpt_first;
+ p = buf;
+ /* Number of leaf nodes (pnodes) */
+ cnt = c->pnode_cnt;
+
+ /*
+ * The first pnode contains the LEB properties for the LEBs that contain
+ * the root inode node and the root index node of the index tree.
+ */
+ node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
+ iopos = ALIGN(node_sz, c->min_io_size);
+ pnode->lprops[0].free = c->leb_size - iopos;
+ pnode->lprops[0].dirty = iopos - node_sz;
+ pnode->lprops[0].flags = LPROPS_INDEX;
+
+ node_sz = UBIFS_INO_NODE_SZ;
+ iopos = ALIGN(node_sz, c->min_io_size);
+ pnode->lprops[1].free = c->leb_size - iopos;
+ pnode->lprops[1].dirty = iopos - node_sz;
+
+ /* The remaining LEBs of the first pnode are recorded as entirely free */
+ for (i = 2; i < UBIFS_LPT_FANOUT; i++)
+ pnode->lprops[i].free = c->leb_size;
+
+ /* Add first pnode */
+ ubifs_pack_pnode(c, p, pnode);
+ p += c->pnode_sz;
+ len = c->pnode_sz;
+ pnode->num += 1;
+
+ /* Reset pnode values for remaining pnodes */
+ pnode->lprops[0].free = c->leb_size;
+ pnode->lprops[0].dirty = 0;
+ pnode->lprops[0].flags = 0;
+
+ pnode->lprops[1].free = c->leb_size;
+ pnode->lprops[1].dirty = 0;
+
+ /*
+ * To calculate the internal node branches, we keep information about
+ * the level below.
+ */
+ blnum = lnum; /* LEB number of level below */
+ boffs = 0; /* Offset of level below */
+ bcnt = cnt; /* Number of nodes in level below */
+ bsz = c->pnode_sz; /* Size of nodes in level below */
+
+ /* Add all remaining pnodes */
+ for (i = 1; i < cnt; i++) {
+ if (len + c->pnode_sz > c->leb_size) {
+ /*
+ * Next pnode does not fit: pad the buffer with 0xff up to
+ * the min. I/O size, record this LEB's free and dirty space
+ * in ltab, write the LEB and start filling the next one.
+ */
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+ ubifs_pack_pnode(c, p, pnode);
+ p += c->pnode_sz;
+ len += c->pnode_sz;
+ /*
+ * pnodes are simply numbered left to right starting at zero,
+ * which means the pnode number can be used easily to traverse
+ * down the tree to the corresponding pnode.
+ */
+ pnode->num += 1;
+ }
+
+ /*
+ * Work out the row of the nnode level directly above the pnodes; it is
+ * passed to 'calc_nnode_num()' and decremented for each level added.
+ */
+ row = 0;
+ for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
+ row += 1;
+ /* Add all nnodes, one level at a time */
+ while (1) {
+ /* Number of internal nodes (nnodes) at next level */
+ cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ if (len + c->nnode_sz > c->leb_size) {
+ /* Same flush sequence as for pnodes above */
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen,
+ alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+ /* Only 1 nnode at this level, so it is the root */
+ if (cnt == 1) {
+ c->lpt_lnum = lnum;
+ c->lpt_offs = len;
+ }
+ /* Set branches to the level below */
+ for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
+ if (bcnt) {
+ /*
+ * Mirror the writer: a node of the level
+ * below that would not have fit was put
+ * at the start of the following LEB.
+ */
+ if (boffs + bsz > c->leb_size) {
+ blnum += 1;
+ boffs = 0;
+ }
+ nnode->nbranch[j].lnum = blnum;
+ nnode->nbranch[j].offs = boffs;
+ boffs += bsz;
+ bcnt--;
+ } else {
+ /* No more nodes below: leave the branch empty */
+ nnode->nbranch[j].lnum = 0;
+ nnode->nbranch[j].offs = 0;
+ }
+ }
+ nnode->num = calc_nnode_num(row, i);
+ ubifs_pack_nnode(c, p, nnode);
+ p += c->nnode_sz;
+ len += c->nnode_sz;
+ }
+ /* Only 1 nnode at this level, so it is the root */
+ if (cnt == 1)
+ break;
+ /* Update the information about the level below */
+ bcnt = cnt;
+ bsz = c->nnode_sz;
+ row -= 1;
+ }
+
+ if (*big_lpt) {
+ /* Need to add LPT's save table */
+ if (len + c->lsave_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+
+ c->lsave_lnum = lnum;
+ c->lsave_offs = len;
+
+ /*
+ * Fill the table with the first main area LEBs; any entries beyond
+ * the number of main LEBs all refer to the first main LEB.
+ */
+ for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
+ lsave[i] = c->main_first + i;
+ for (; i < c->lsave_cnt; i++)
+ lsave[i] = c->main_first;
+
+ ubifs_pack_lsave(c, p, lsave);
+ p += c->lsave_sz;
+ len += c->lsave_sz;
+ }
+
+ /* Need to add LPT's own LEB properties table */
+ if (len + c->ltab_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+
+ c->ltab_lnum = lnum;
+ c->ltab_offs = len;
+
+ /* Update ltab before packing it */
+ len += c->ltab_sz;
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+
+ ubifs_pack_ltab(c, p, ltab);
+ p += c->ltab_sz;
+
+ /* Write remaining buffer */
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
+ if (err)
+ goto out;
+
+ /* The LPT head is placed right after the aligned end of the last write */
+ c->nhead_lnum = lnum;
+ c->nhead_offs = ALIGN(len, c->min_io_size);
+
+ dbg_lp("space_bits %d", c->space_bits);
+ dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+ dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+ dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+ dbg_lp("pcnt_bits %d", c->pcnt_bits);
+ dbg_lp("lnum_bits %d", c->lnum_bits);
+ dbg_lp("pnode_sz %d", c->pnode_sz);
+ dbg_lp("nnode_sz %d", c->nnode_sz);
+ dbg_lp("ltab_sz %d", c->ltab_sz);
+ dbg_lp("lsave_sz %d", c->lsave_sz);
+ dbg_lp("lsave_cnt %d", c->lsave_cnt);
+ dbg_lp("lpt_hght %d", c->lpt_hght);
+ dbg_lp("big_lpt %d", c->big_lpt);
+ dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+ dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+ dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+ if (c->big_lpt)
+ dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+out:
+ /*
+ * Reached on success as well as on error. 'c->ltab' referred to the local
+ * table, which is freed here, so the pointer must not be left behind.
+ */
+ c->ltab = NULL;
+ kfree(lsave);
+ vfree(ltab);
+ vfree(buf);
+ kfree(nnode);
+ kfree(pnode);
+ return err;
+}
+
+/**
+ * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * When a pnode is loaded into memory, the LEB properties it contains are added,
+ * by this function, to the LEB category lists and heaps. Processing stops at
+ * the first entry whose LEB number is zero.
+ */
+static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	int idx;
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_lprops *lp = &pnode->lprops[idx];
+
+		if (!lp->lnum)
+			break;
+		/* The category is kept in the low bits of the flags */
+		ubifs_add_to_cat(c, lp, lp->flags & LPROPS_CAT_MASK);
+	}
+}
+
+/**
+ * replace_cats - replace a pnode's LEB properties in category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @old_pnode: pnode copied
+ * @new_pnode: pnode copy
+ *
+ * During commit it is sometimes necessary to copy a pnode
+ * (see dirty_cow_pnode). When that happens, references in
+ * category lists and heaps must be replaced. This function does that,
+ * stopping at the first entry of @new_pnode whose LEB number is zero.
+ */
+static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
+			 struct ubifs_pnode *new_pnode)
+{
+	int idx;
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_lprops *was = &old_pnode->lprops[idx];
+		struct ubifs_lprops *now = &new_pnode->lprops[idx];
+
+		if (!now->lnum)
+			break;
+		ubifs_replace_cat(c, was, now);
+	}
+}
+
+/**
+ * check_lpt_crc - check LPT node crc is correct.
+ * @buf: buffer containing node
+ * @len: length of node
+ *
+ * The CRC stored at the start of @buf is compared with the CRC calculated over
+ * the rest of the node.
+ *
+ * This function returns %0 on success and %-EINVAL if the two differ.
+ */
+static int check_lpt_crc(void *buf, int len)
+{
+	uint8_t *rp = buf;
+	int bit = 0;
+	uint16_t stored, computed;
+
+	stored = ubifs_unpack_bits(&rp, &bit, UBIFS_LPT_CRC_BITS);
+	computed = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+			 len - UBIFS_LPT_CRC_BYTES);
+	if (stored == computed)
+		return 0;
+
+	ubifs_err("invalid crc in LPT node: crc %hx calc %hx", stored,
+		  computed);
+	dbg_dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * check_lpt_type - check LPT node type is correct.
+ * @addr: address of type bit field is passed and returned updated here
+ * @pos: position of type bit field is passed and returned updated here
+ * @type: expected type
+ *
+ * The type field is consumed from the bit stream whether or not it matches.
+ *
+ * This function returns %0 on success and %-EINVAL on a type mismatch.
+ */
+static int check_lpt_type(uint8_t **addr, int *pos, int type)
+{
+	int found = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
+
+	if (found == type)
+		return 0;
+
+	ubifs_err("invalid type (%d) in LPT node type %d", found,
+		  type);
+	dbg_dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * unpack_pnode - unpack a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed pnode to unpack
+ * @pnode: pnode structure to fill
+ *
+ * The node type is checked first, the fields are unpacked and each LEB is
+ * categorized, and only then is the CRC verified. @pnode may therefore have
+ * been modified even when an error is returned.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_pnode(struct ubifs_info *c, void *buf,
+			struct ubifs_pnode *pnode)
+{
+	uint8_t *rp = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, idx, err;
+
+	err = check_lpt_type(&rp, &bit, UBIFS_LPT_PNODE);
+	if (err)
+		return err;
+
+	/* The pnode number is only stored in the big model */
+	if (c->big_lpt)
+		pnode->num = ubifs_unpack_bits(&rp, &bit, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_lprops * const lp = &pnode->lprops[idx];
+		int free, dirty;
+
+		/* Space values are stored divided by 8 */
+		free = ubifs_unpack_bits(&rp, &bit, c->space_bits);
+		dirty = ubifs_unpack_bits(&rp, &bit, c->space_bits);
+		lp->free = free << 3;
+		lp->dirty = dirty << 3;
+
+		lp->flags = ubifs_unpack_bits(&rp, &bit, 1) ? LPROPS_INDEX : 0;
+		lp->flags |= ubifs_categorize_lprops(c, lp);
+	}
+
+	return check_lpt_crc(buf, c->pnode_sz);
+}
+
+/**
+ * unpack_nnode - unpack a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed nnode to unpack
+ * @nnode: nnode structure to fill
+ *
+ * The node type is checked first, the branches are unpacked, and only then is
+ * the CRC verified. A stored LEB number that decodes to @c->lpt_last + 1 is
+ * turned back into zero.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_nnode(struct ubifs_info *c, void *buf,
+			struct ubifs_nnode *nnode)
+{
+	uint8_t *rp = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, idx, err;
+
+	err = check_lpt_type(&rp, &bit, UBIFS_LPT_NNODE);
+	if (err)
+		return err;
+
+	/* The nnode number is only stored in the big model */
+	if (c->big_lpt)
+		nnode->num = ubifs_unpack_bits(&rp, &bit, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_nbranch *br = &nnode->nbranch[idx];
+		int lnum;
+
+		/* LEB numbers are stored relative to the first LPT LEB */
+		lnum = ubifs_unpack_bits(&rp, &bit, c->lpt_lnum_bits) +
+		       c->lpt_first;
+		br->lnum = (lnum == c->lpt_last + 1) ? 0 : lnum;
+		br->offs = ubifs_unpack_bits(&rp, &bit, c->lpt_offs_bits);
+	}
+
+	return check_lpt_crc(buf, c->nnode_sz);
+}
+
+/**
+ * unpack_ltab - unpack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * Each entry is range-checked against @c->leb_size before it is stored in
+ * @c->ltab; the CRC is verified after all entries have been unpacked.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_ltab(struct ubifs_info *c, void *buf)
+{
+	uint8_t *rp = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, idx, err;
+
+	err = check_lpt_type(&rp, &bit, UBIFS_LPT_LTAB);
+	if (err)
+		return err;
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		struct ubifs_lpt_lprops *ent = &c->ltab[idx];
+		int free = ubifs_unpack_bits(&rp, &bit, c->lpt_spc_bits);
+		int dirty = ubifs_unpack_bits(&rp, &bit, c->lpt_spc_bits);
+
+		/* Neither value, nor their sum, may exceed one LEB */
+		if (free < 0 || free > c->leb_size)
+			return -EINVAL;
+		if (dirty < 0 || dirty > c->leb_size)
+			return -EINVAL;
+		if (free + dirty > c->leb_size)
+			return -EINVAL;
+
+		ent->free = free;
+		ent->dirty = dirty;
+		ent->tgc = 0;
+		ent->cmt = 0;
+	}
+
+	return check_lpt_crc(buf, c->ltab_sz);
+}
+
+/**
+ * unpack_lsave - unpack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * Each LEB number must lie in the main area (from @c->main_first up to, but
+ * not including, @c->leb_cnt) before it is stored in @c->lsave; the CRC is
+ * verified after all entries have been unpacked.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_lsave(struct ubifs_info *c, void *buf)
+{
+	uint8_t *rp = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, idx, err;
+
+	err = check_lpt_type(&rp, &bit, UBIFS_LPT_LSAVE);
+	if (err)
+		return err;
+
+	idx = 0;
+	while (idx < c->lsave_cnt) {
+		int lnum = ubifs_unpack_bits(&rp, &bit, c->lnum_bits);
+
+		if (lnum < c->main_first || lnum >= c->leb_cnt)
+			return -EINVAL;
+		c->lsave[idx++] = lnum;
+	}
+
+	return check_lpt_crc(buf, c->lsave_sz);
+}
+
+/**
+ * validate_nnode - validate a nnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to validate
+ * @parent: parent nnode (or NULL for the root nnode)
+ * @iip: index in parent
+ *
+ * In the big model the stored nnode number must match the one derived from
+ * @parent and @iip. Every branch must either be empty (LEB number and offset
+ * both zero) or point inside the LPT area at an offset that leaves room for
+ * one node of the level below.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int level, limit, n;
+
+	if (c->big_lpt &&
+	    nnode->num != calc_nnode_num_from_parent(c, parent, iip))
+		return -EINVAL;
+
+	/* The root sits at the full tree height, a child one below its parent */
+	if (parent)
+		level = parent->level - 1;
+	else
+		level = c->lpt_hght;
+	if (level < 1)
+		return -EINVAL;
+
+	/* At level 1 the offset limit uses the pnode size, else the nnode size */
+	limit = c->leb_size - (level == 1 ? c->pnode_sz : c->nnode_sz);
+
+	for (n = 0; n < UBIFS_LPT_FANOUT; n++) {
+		int br_lnum = nnode->nbranch[n].lnum;
+		int br_offs = nnode->nbranch[n].offs;
+
+		if (br_lnum == 0) {
+			/* An empty branch must not carry an offset */
+			if (br_offs != 0)
+				return -EINVAL;
+			continue;
+		}
+		if (br_lnum < c->lpt_first || br_lnum > c->lpt_last)
+			return -EINVAL;
+		if (br_offs < 0 || br_offs > limit)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * validate_pnode - validate a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to validate
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * In the big model the stored pnode number must match the one derived from
+ * @parent and @iip. For every LEB, free and dirty space must be multiples of 8
+ * within [0, @c->leb_size], free space must also be a multiple of
+ * @c->min_io_size, and their sum must not exceed @c->leb_size.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int n;
+
+	if (c->big_lpt &&
+	    pnode->num != calc_pnode_num_from_parent(c, parent, iip))
+		return -EINVAL;
+
+	for (n = 0; n < UBIFS_LPT_FANOUT; n++) {
+		int free = pnode->lprops[n].free;
+		int dirty = pnode->lprops[n].dirty;
+
+		if (free < 0 || free > c->leb_size)
+			return -EINVAL;
+		if (free % c->min_io_size || (free & 7))
+			return -EINVAL;
+		if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
+			return -EINVAL;
+		if (dirty + free > c->leb_size)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * set_pnode_lnum - fill in the LEB numbers of a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to update
+ *
+ * The LEB number of each LEB properties slot is derived from the pnode
+ * number: slot 0 gets (@pnode->num << %UBIFS_LPT_FANOUT_SHIFT) +
+ * @c->main_first and the following slots get consecutive numbers. Slots
+ * whose number would be at or beyond @c->leb_cnt are left untouched.
+ */
+static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	int first = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first;
+	int k;
+
+	for (k = 0; k < UBIFS_LPT_FANOUT && first + k < c->leb_cnt; k++)
+		pnode->lprops[k].lnum = first + k;
+}
+
+/**
+ * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * The position of the nnode is taken from the parent's branch @iip, or from
+ * @c->lpt_lnum / @c->lpt_offs when @parent is NULL. A LEB number of zero
+ * means the nnode was never written; in that case a zeroed nnode is used
+ * instead of reading the media. The new nnode is validated and then hooked
+ * into the parent's branch (or into @c->nroot for the root).
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *br = parent ? &parent->nbranch[iip] : NULL;
+	int lnum = br ? br->lnum : c->lpt_lnum;
+	int offs = br ? br->offs : c->lpt_offs;
+	void *buf = c->lpt_nod_buf;
+	struct ubifs_nnode *nn;
+	int err;
+
+	nn = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	if (!nn) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (lnum != 0) {
+		err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
+		if (!err)
+			err = unpack_nnode(c, buf, nn);
+		if (err)
+			goto fail;
+	} else if (c->big_lpt) {
+		/*
+		 * An nnode that was never written only covers empty LEBs, so
+		 * the zeroed allocation already is its content; just the node
+		 * number has to be supplied.
+		 */
+		nn->num = calc_nnode_num_from_parent(c, parent, iip);
+	}
+
+	err = validate_nnode(c, nn, parent, iip);
+	if (err)
+		goto fail;
+
+	/* Small LPTs do not store node numbers, so compute it here */
+	if (!c->big_lpt)
+		nn->num = calc_nnode_num_from_parent(c, parent, iip);
+
+	if (!parent) {
+		c->nroot = nn;
+		nn->level = c->lpt_hght;
+	} else {
+		br->nnode = nn;
+		nn->level = parent->level - 1;
+	}
+	nn->parent = parent;
+	nn->iip = iip;
+	return 0;
+
+fail:
+	ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
+	kfree(nn);
+	return err;
+}
+
+/**
+ * read_pnode - read a pnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The position of the pnode is taken from branch @iip of @parent. A LEB
+ * number of zero means the pnode was never written; it is then synthesised
+ * with every LEB marked completely free. On success the pnode is linked into
+ * the parent's branch, its LEB numbers are filled in and @c->pnodes_have is
+ * incremented.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode;
+	void *buf = c->lpt_nod_buf;
+	int err, lnum, offs;
+
+	branch = &parent->nbranch[iip];
+	lnum = branch->lnum;
+	offs = branch->offs;
+	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	if (!pnode)
+		/*
+		 * Nothing was read and there is no pnode to dump, so do not
+		 * go through the diagnostic path below with a NULL pointer.
+		 */
+		return -ENOMEM;
+
+	if (lnum == 0) {
+		/*
+		 * This pnode was not written which just means that the LEB
+		 * properties in it describe empty LEBs. We make the pnode as
+		 * though we had read it.
+		 */
+		int i;
+
+		if (c->big_lpt)
+			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+			lprops->free = c->leb_size;
+			lprops->flags = ubifs_categorize_lprops(c, lprops);
+		}
+	} else {
+		err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
+		if (err)
+			goto out;
+		err = unpack_pnode(c, buf, pnode);
+		if (err)
+			goto out;
+	}
+	err = validate_pnode(c, pnode, parent, iip);
+	if (err)
+		goto out;
+	/* Small LPTs do not store node numbers, so compute it here */
+	if (!c->big_lpt)
+		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+	branch->pnode = pnode;
+	pnode->parent = parent;
+	pnode->iip = iip;
+	set_pnode_lnum(c, pnode);
+	c->pnodes_have += 1;
+	return 0;
+
+out:
+	/* Only reached with a valid (possibly partially unpacked) pnode */
+	ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
+	dbg_dump_pnode(c, pnode, parent, iip);
+	dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+	kfree(pnode);
+	return err;
+}
+
+/**
+ * read_ltab - read LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ *
+ * Reads @c->ltab_sz bytes from @c->ltab_lnum:@c->ltab_offs into a temporary
+ * buffer and unpacks them with 'unpack_ltab()'. The buffer is released again
+ * before returning.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_ltab(struct ubifs_info *c)
+{
+	void *tbl = vmalloc(c->ltab_sz);
+	int ret;
+
+	if (!tbl)
+		return -ENOMEM;
+
+	ret = ubi_read(c->ubi, c->ltab_lnum, tbl, c->ltab_offs, c->ltab_sz);
+	if (!ret)
+		ret = unpack_ltab(c, tbl);
+
+	vfree(tbl);
+	return ret;
+}
+
+/**
+ * read_lsave - read LPT's save table.
+ * @c: UBIFS file-system description object
+ *
+ * Reads and unpacks the save table into @c->lsave and then looks up every
+ * LEB listed there, which pulls the corresponding LPT nodes into memory.
+ * A failing lookup aborts the function and its error code is returned.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_lsave(struct ubifs_info *c)
+{
+	int err, i;
+	void *buf;
+
+	buf = vmalloc(c->lsave_sz);
+	if (!buf)
+		return -ENOMEM;
+	err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
+	if (err)
+		goto out;
+	err = unpack_lsave(c, buf);
+	if (err)
+		goto out;
+	for (i = 0; i < c->lsave_cnt; i++) {
+		int lnum = c->lsave[i];
+		struct ubifs_lprops *lprops;
+
+		/*
+		 * Due to automatic resizing, the values in the lsave table
+		 * could be beyond the volume size - just ignore them.
+		 */
+		if (lnum >= c->leb_cnt)
+			continue;
+		/* Do not swallow I/O, validation or allocation failures */
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
+	}
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_get_nnode - get a nnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (dereferenced unconditionally)
+ * @iip: index in parent
+ *
+ * If branch @iip of @parent already has an nnode in memory it is returned
+ * directly, otherwise the nnode is read with 'ubifs_read_nnode()' first.
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *slot = &parent->nbranch[iip];
+	int ret;
+
+	if (!slot->nnode) {
+		ret = ubifs_read_nnode(c, parent, iip);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	return slot->nnode;
+}
+
+/**
+ * ubifs_get_pnode - get a pnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * If branch @iip of @parent already has a pnode in memory it is returned
+ * directly. Otherwise the pnode is read with 'read_pnode()' and passed to
+ * 'update_cats()' before being returned.
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *slot = &parent->nbranch[iip];
+	int ret;
+
+	if (slot->pnode)
+		return slot->pnode;
+
+	ret = read_pnode(c, parent, iip);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Only a freshly read pnode goes through update_cats() */
+	update_cats(c, slot->pnode);
+	return slot->pnode;
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * The root nnode is read if it is not in memory yet. The tree is then
+ * descended using successive %UBIFS_LPT_FANOUT_SHIFT-bit groups of
+ * (@lnum - @c->main_first), most significant group first, reading nodes
+ * that are not in memory on the way. The lowest group selects the LEB
+ * properties slot within the pnode.
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+	const int mask = UBIFS_LPT_FANOUT - 1;
+	struct ubifs_nnode *nn;
+	struct ubifs_pnode *pn;
+	struct ubifs_lprops *lp;
+	int idx, shift, level, ret;
+
+	if (!c->nroot) {
+		ret = ubifs_read_nnode(c, NULL, 0);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	nn = c->nroot;
+	idx = lnum - c->main_first;
+	shift = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+
+	/* Walk the nnode levels below the root */
+	for (level = 1; level < c->lpt_hght; level++) {
+		nn = ubifs_get_nnode(c, nn, (idx >> shift) & mask);
+		if (IS_ERR(nn))
+			return ERR_PTR(PTR_ERR(nn));
+		shift -= UBIFS_LPT_FANOUT_SHIFT;
+	}
+
+	pn = ubifs_get_pnode(c, nn, (idx >> shift) & mask);
+	if (IS_ERR(pn))
+		return ERR_PTR(PTR_ERR(pn));
+
+	lp = &pn->lprops[idx & mask];
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       lp->free, lp->dirty, lp->flags);
+	return lp;
+}
+
+/**
+ * dirty_cow_nnode - ensure a nnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to check
+ *
+ * If %COW_CNODE is not set on @nnode, the nnode is marked dirty in place (and
+ * accounted for the first time it becomes dirty) and returned as is.
+ * Otherwise a copy is allocated, marked dirty, given the children of @nnode
+ * and linked into the tree in place of @nnode, which is marked obsolete.
+ *
+ * Returns dirtied nnode on success or negative error code on failure.
+ */
+static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *nnode)
+{
+ struct ubifs_nnode *n;
+ int i;
+
+ if (!test_bit(COW_CNODE, &nnode->flags)) {
+ /* nnode is not being committed */
+ if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+ /* Count and account dirt only on the clean -> dirty transition */
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ }
+ return nnode;
+ }
+
+ /* nnode is being committed, so copy it */
+ n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ if (unlikely(!n))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(n, nnode, sizeof(struct ubifs_nnode));
+ /* The copy starts out dirty, not copy-on-write and with cnext cleared */
+ n->cnext = NULL;
+ __set_bit(DIRTY_CNODE, &n->flags);
+ __clear_bit(COW_CNODE, &n->flags);
+
+ /* The children now have new parent */
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_nbranch *branch = &n->nbranch[i];
+
+ if (branch->cnode)
+ branch->cnode->parent = n;
+ }
+
+ /* The original stays allocated but is flagged as superseded */
+ ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
+ __set_bit(OBSOLETE_CNODE, &nnode->flags);
+
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ /* Redirect the parent's branch (or the root pointer) to the copy */
+ if (nnode->parent)
+ nnode->parent->nbranch[n->iip].nnode = n;
+ else
+ c->nroot = n;
+ return n;
+}
+
+/**
+ * dirty_cow_pnode - ensure a pnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to check
+ *
+ * If %COW_CNODE is not set on @pnode, the pnode is marked dirty in place (and
+ * accounted for the first time it becomes dirty) and returned as is.
+ * Otherwise a copy is allocated, marked dirty, substituted for @pnode via
+ * 'replace_cats()' and linked into the parent's branch, and @pnode is marked
+ * obsolete.
+ *
+ * Returns dirtied pnode on success or negative error code on failure.
+ */
+static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
+ struct ubifs_pnode *pnode)
+{
+ struct ubifs_pnode *p;
+
+ if (!test_bit(COW_CNODE, &pnode->flags)) {
+ /* pnode is not being committed */
+ if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+ /* Count and account dirt only on the clean -> dirty transition */
+ c->dirty_pn_cnt += 1;
+ add_pnode_dirt(c, pnode);
+ }
+ return pnode;
+ }
+
+ /* pnode is being committed, so copy it */
+ p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ if (unlikely(!p))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(p, pnode, sizeof(struct ubifs_pnode));
+ /* The copy starts out dirty, not copy-on-write and with cnext cleared */
+ p->cnext = NULL;
+ __set_bit(DIRTY_CNODE, &p->flags);
+ __clear_bit(COW_CNODE, &p->flags);
+ replace_cats(c, pnode, p);
+
+ /* The original stays allocated but is flagged as superseded */
+ ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
+ __set_bit(OBSOLETE_CNODE, &pnode->flags);
+
+ c->dirty_pn_cnt += 1;
+ add_pnode_dirt(c, pnode);
+ /* Redirect the parent's branch to the copy */
+ pnode->parent->nbranch[p->iip].pnode = p;
+ return p;
+}
+
+/**
+ * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT, dirtying the path.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * Works like 'ubifs_lpt_lookup()' but additionally passes the root, every
+ * nnode on the path and the final pnode through 'dirty_cow_nnode()' /
+ * 'dirty_cow_pnode()', so the returned LEB properties live in a pnode that
+ * has %DIRTY_CNODE set.
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
+{
+	const int mask = UBIFS_LPT_FANOUT - 1;
+	struct ubifs_nnode *nnode;
+	struct ubifs_pnode *pnode;
+	struct ubifs_lprops *lp;
+	int idx, shift, level, ret;
+
+	if (!c->nroot) {
+		ret = ubifs_read_nnode(c, NULL, 0);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	nnode = dirty_cow_nnode(c, c->nroot);
+	if (IS_ERR(nnode))
+		return ERR_PTR(PTR_ERR(nnode));
+
+	idx = lnum - c->main_first;
+	shift = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+
+	/* Walk the nnode levels below the root, dirtying each one */
+	for (level = 1; level < c->lpt_hght; level++) {
+		nnode = ubifs_get_nnode(c, nnode, (idx >> shift) & mask);
+		if (IS_ERR(nnode))
+			return ERR_PTR(PTR_ERR(nnode));
+		shift -= UBIFS_LPT_FANOUT_SHIFT;
+
+		nnode = dirty_cow_nnode(c, nnode);
+		if (IS_ERR(nnode))
+			return ERR_PTR(PTR_ERR(nnode));
+	}
+
+	pnode = ubifs_get_pnode(c, nnode, (idx >> shift) & mask);
+	if (IS_ERR(pnode))
+		return ERR_PTR(PTR_ERR(pnode));
+
+	pnode = dirty_cow_pnode(c, pnode);
+	if (IS_ERR(pnode))
+		return ERR_PTR(PTR_ERR(pnode));
+
+	lp = &pnode->lprops[idx & mask];
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       lp->free, lp->dirty, lp->flags);
+	ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags));
+	return lp;
+}
+
+/**
+ * lpt_init_rd - initialize the LPT for reading.
+ * @c: UBIFS file-system description object
+ *
+ * Allocates the ltab, a node buffer large enough for either a nnode or a
+ * pnode, the category heaps and the dirty index array, then reads the ltab
+ * from flash and prints the LPT geometry as debugging output.
+ *
+ * NOTE(review): nothing allocated here is released when a later step fails;
+ * presumably the caller tears the LPT down on error - confirm.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_rd(struct ubifs_info *c)
+{
+	int ret, buf_sz, cat;
+
+	c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab)
+		return -ENOMEM;
+
+	/* One buffer serves both node kinds, so size it for the larger */
+	buf_sz = max_t(int, c->nnode_sz, c->pnode_sz);
+	c->lpt_nod_buf = kmalloc(buf_sz, GFP_KERNEL);
+	if (!c->lpt_nod_buf)
+		return -ENOMEM;
+
+	for (cat = 0; cat < LPROPS_HEAP_CNT; cat++) {
+		struct ubifs_lpt_heap *heap = &c->lpt_heap[cat];
+
+		heap->arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
+		if (!heap->arr)
+			return -ENOMEM;
+		heap->cnt = 0;
+		heap->max_cnt = LPT_HEAP_SZ;
+	}
+
+	c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
+	if (!c->dirty_idx.arr)
+		return -ENOMEM;
+	c->dirty_idx.cnt = 0;
+	c->dirty_idx.max_cnt = LPT_HEAP_SZ;
+
+	ret = read_ltab(c);
+	if (ret)
+		return ret;
+
+	dbg_lp("space_bits %d", c->space_bits);
+	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+	dbg_lp("pcnt_bits %d", c->pcnt_bits);
+	dbg_lp("lnum_bits %d", c->lnum_bits);
+	dbg_lp("pnode_sz %d", c->pnode_sz);
+	dbg_lp("nnode_sz %d", c->nnode_sz);
+	dbg_lp("ltab_sz %d", c->ltab_sz);
+	dbg_lp("lsave_sz %d", c->lsave_sz);
+	dbg_lp("lsave_cnt %d", c->lsave_cnt);
+	dbg_lp("lpt_hght %d", c->lpt_hght);
+	dbg_lp("big_lpt %d", c->big_lpt);
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+	return 0;
+}
+
+/**
+ * lpt_init_wr - initialize the LPT for writing.
+ * @c: UBIFS file-system description object
+ *
+ * 'lpt_init_rd()' must have been called already.
+ *
+ * Allocates the commit copy of the ltab and a LEB-sized LPT buffer. For a
+ * big LPT the save table is allocated and read as well. Finally every LPT
+ * LEB whose ltab entry shows it as completely free is unmapped.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_wr(struct ubifs_info *c)
+{
+	int ret, n;
+
+	c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab_cmt)
+		return -ENOMEM;
+
+	c->lpt_buf = vmalloc(c->leb_size);
+	if (!c->lpt_buf)
+		return -ENOMEM;
+
+	if (c->big_lpt) {
+		c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
+		if (!c->lsave)
+			return -ENOMEM;
+
+		ret = read_lsave(c);
+		if (ret)
+			return ret;
+	}
+
+	for (n = 0; n < c->lpt_lebs; n++) {
+		if (c->ltab[n].free != c->leb_size)
+			continue;
+
+		/* ltab index n corresponds to LEB number lpt_first + n */
+		ret = ubifs_leb_unmap(c, c->lpt_first + n);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_lpt_init - initialize the LPT.
+ * @c: UBIFS file-system description object
+ * @rd: whether to initialize lpt for reading
+ * @wr: whether to initialize lpt for writing
+ *
+ * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
+ * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
+ * true.
+ *
+ * Read initialization always runs before write initialization, and write
+ * initialization is skipped if read initialization failed.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
+{
+	int ret = 0;
+
+	if (rd)
+		ret = lpt_init_rd(c);
+	if (!ret && wr)
+		ret = lpt_init_wr(c);
+
+	return ret;
+}
+
+/**
+ * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
+ * @nnode: where to keep a nnode
+ * @pnode: where to keep a pnode
+ * @cnode: where to keep a cnode
+ * @in_tree: is the node in the tree in memory
+ * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
+ * the tree
+ * @ptr.pnode: ditto for pnode
+ * @ptr.cnode: ditto for cnode
+ *
+ * @nnode, @pnode and @cnode share storage (anonymous union), as do the three
+ * @ptr members. The scan keeps one of these per tree level: the embedded node
+ * is scratch space for a node that is not in the tree, and @ptr refers either
+ * to that scratch copy or to the node in the tree, as told by @in_tree.
+ */
+struct lpt_scan_node {
+ /* Scratch copy of the node, used when it is not in the tree */
+ union {
+ struct ubifs_nnode nnode;
+ struct ubifs_pnode pnode;
+ struct ubifs_cnode cnode;
+ };
+ int in_tree;
+ /* The node actually in use at this level of the path */
+ union {
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct ubifs_cnode *cnode;
+ } ptr;
+};
+
+/**
+ * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the nnode
+ * @parent: parent of the nnode
+ * @iip: index in parent of the nnode
+ *
+ * If the nnode is already in the tree in memory, @path merely records a
+ * pointer to it. Otherwise the nnode is built in the scratch space of @path
+ * (read from flash, or left zeroed if it was never written) and validated;
+ * it is not linked into the tree.
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *br = &parent->nbranch[iip];
+	struct ubifs_nnode *nn = br->nnode;
+	int ret;
+
+	if (nn) {
+		path->in_tree = 1;
+		path->ptr.nnode = nn;
+		return nn;
+	}
+
+	/* Not in memory: assemble it in the caller-provided scratch slot */
+	nn = &path->nnode;
+	path->in_tree = 0;
+	path->ptr.nnode = nn;
+	memset(nn, 0, sizeof(struct ubifs_nnode));
+
+	if (br->lnum != 0) {
+		void *buf = c->lpt_nod_buf;
+
+		ret = ubi_read(c->ubi, br->lnum, buf, br->offs, c->nnode_sz);
+		if (ret)
+			return ERR_PTR(ret);
+		ret = unpack_nnode(c, buf, nn);
+		if (ret)
+			return ERR_PTR(ret);
+	} else if (c->big_lpt) {
+		/*
+		 * A nnode that was never written only covers empty LEBs, so
+		 * the zeroed scratch slot already is its content; just the
+		 * node number has to be supplied.
+		 */
+		nn->num = calc_nnode_num_from_parent(c, parent, iip);
+	}
+
+	ret = validate_nnode(c, nn, parent, iip);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Small LPTs do not store node numbers, so compute it here */
+	if (!c->big_lpt)
+		nn->num = calc_nnode_num_from_parent(c, parent, iip);
+	nn->level = parent->level - 1;
+	nn->parent = parent;
+	nn->iip = iip;
+	return nn;
+}
+
+/**
+ * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the pnode
+ * @parent: parent of the pnode
+ * @iip: index in parent of the pnode
+ *
+ * If the pnode is already in the tree in memory, @path merely records a
+ * pointer to it. Otherwise the pnode is built in the scratch space of @path
+ * (read from flash, or synthesised with all LEBs free if it was never
+ * written), validated and given its LEB numbers; it is not linked into the
+ * tree.
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch = &parent->nbranch[iip];
+	struct ubifs_pnode *pn = branch->pnode;
+	int ret;
+
+	if (pn) {
+		path->in_tree = 1;
+		path->ptr.pnode = pn;
+		return pn;
+	}
+
+	/* Not in memory: assemble it in the caller-provided scratch slot */
+	pn = &path->pnode;
+	path->in_tree = 0;
+	path->ptr.pnode = pn;
+	memset(pn, 0, sizeof(struct ubifs_pnode));
+
+	if (branch->lnum != 0) {
+		void *buf = c->lpt_nod_buf;
+
+		ubifs_assert(branch->lnum >= c->lpt_first &&
+			     branch->lnum <= c->lpt_last);
+		ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
+		ret = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
+			       c->pnode_sz);
+		if (ret)
+			return ERR_PTR(ret);
+		ret = unpack_pnode(c, buf, pn);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		/*
+		 * A pnode that was never written describes empty LEBs only,
+		 * so make it look as though it had been read.
+		 */
+		int k;
+
+		if (c->big_lpt)
+			pn->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (k = 0; k < UBIFS_LPT_FANOUT; k++) {
+			struct ubifs_lprops * const lp = &pn->lprops[k];
+
+			lp->free = c->leb_size;
+			lp->flags = ubifs_categorize_lprops(c, lp);
+		}
+	}
+
+	ret = validate_pnode(c, pn, parent, iip);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Small LPTs do not store node numbers, so compute it here */
+	if (!c->big_lpt)
+		pn->num = calc_pnode_num_from_parent(c, parent, iip);
+	pn->parent = parent;
+	pn->iip = iip;
+	set_pnode_lnum(c, pn);
+	return pn;
+}
+
+/**
+ * ubifs_lpt_scan_nolock - scan the LPT.
+ * @c: the UBIFS file-system description object
+ * @start_lnum: LEB number from which to start scanning
+ * @end_lnum: LEB number at which to stop scanning
+ * @scan_cb: callback function called for each lprops
+ * @data: data to be passed to the callback function
+ *
+ * If @start_lnum is %-1, the scan starts at the LEB following @end_lnum,
+ * wrapping around to @c->main_first. LEB properties are visited in
+ * increasing LEB number order, wrapping around at the end of the volume,
+ * until the callback asks to stop or @end_lnum has been visited.
+ *
+ * The callback's return value is interpreted as follows: a negative value
+ * aborts the scan and is returned; the %LPT_SCAN_ADD bit requests that the
+ * nodes on the current path that are not yet in the tree in memory be added
+ * to it; the %LPT_SCAN_STOP bit ends the scan successfully.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ * %-ENOSPC is returned if @end_lnum was visited without the callback asking
+ * to stop.
+ */
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+ ubifs_lpt_scan_callback scan_cb, void *data)
+{
+ int err = 0, i, h, iip, shft;
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct lpt_scan_node *path;
+
+ if (start_lnum == -1) {
+ start_lnum = end_lnum + 1;
+ if (start_lnum >= c->leb_cnt)
+ start_lnum = c->main_first;
+ }
+
+ ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
+ ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return err;
+ }
+
+ /* One slot per tree level: index 0 is the root, index lpt_hght the pnode */
+ path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
+ GFP_NOFS);
+ if (!path)
+ return -ENOMEM;
+
+ path[0].ptr.nnode = c->nroot;
+ path[0].in_tree = 1;
+again:
+ /* Descend to the pnode containing start_lnum */
+ nnode = c->nroot;
+ i = start_lnum - c->main_first;
+ shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+ for (h = 1; h < c->lpt_hght; h++) {
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = scan_get_nnode(c, path + h, nnode, iip);
+ if (IS_ERR(nnode)) {
+ err = PTR_ERR(nnode);
+ goto out;
+ }
+ }
+ /* Here h == c->lpt_hght, i.e. the path slot used for the pnode */
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ pnode = scan_get_pnode(c, path + h, nnode, iip);
+ if (IS_ERR(pnode)) {
+ err = PTR_ERR(pnode);
+ goto out;
+ }
+ /* From now on iip indexes the lprops within the pnode */
+ iip = (i & (UBIFS_LPT_FANOUT - 1));
+
+ /* Loop for each lprops */
+ while (1) {
+ struct ubifs_lprops *lprops = &pnode->lprops[iip];
+ int ret, lnum = lprops->lnum;
+
+ ret = scan_cb(c, lprops, path[h].in_tree, data);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret & LPT_SCAN_ADD) {
+ /* Add all the nodes in path to the tree in memory */
+ for (h = 1; h < c->lpt_hght; h++) {
+ const size_t sz = sizeof(struct ubifs_nnode);
+ struct ubifs_nnode *parent;
+
+ if (path[h].in_tree)
+ continue;
+ nnode = kmalloc(sz, GFP_NOFS);
+ if (!nnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(nnode, &path[h].nnode, sz);
+ parent = nnode->parent;
+ parent->nbranch[nnode->iip].nnode = nnode;
+ path[h].ptr.nnode = nnode;
+ path[h].in_tree = 1;
+ /* The scratch child must now point at the heap copy */
+ path[h + 1].cnode.parent = nnode;
+ }
+ /* The loop above leaves h == c->lpt_hght (the pnode slot) */
+ if (path[h].in_tree)
+ ubifs_ensure_cat(c, lprops);
+ else {
+ const size_t sz = sizeof(struct ubifs_pnode);
+ struct ubifs_nnode *parent;
+
+ pnode = kmalloc(sz, GFP_NOFS);
+ if (!pnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(pnode, &path[h].pnode, sz);
+ parent = pnode->parent;
+ parent->nbranch[pnode->iip].pnode = pnode;
+ path[h].ptr.pnode = pnode;
+ path[h].in_tree = 1;
+ update_cats(c, pnode);
+ c->pnodes_have += 1;
+ }
+ err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
+ c->nroot, 0, 0);
+ if (err)
+ goto out;
+ err = dbg_check_cats(c);
+ if (err)
+ goto out;
+ }
+ if (ret & LPT_SCAN_STOP) {
+ err = 0;
+ break;
+ }
+ /* Get the next lprops */
+ if (lnum == end_lnum) {
+ /*
+ * We got to the end without finding what we were
+ * looking for
+ */
+ err = -ENOSPC;
+ goto out;
+ }
+ if (lnum + 1 >= c->leb_cnt) {
+ /* Wrap-around to the beginning */
+ start_lnum = c->main_first;
+ goto again;
+ }
+ if (iip + 1 < UBIFS_LPT_FANOUT) {
+ /* Next lprops is in the same pnode */
+ iip += 1;
+ continue;
+ }
+ /* We need to get the next pnode. Go up until we can go right */
+ iip = pnode->iip;
+ while (1) {
+ h -= 1;
+ ubifs_assert(h >= 0);
+ nnode = path[h].ptr.nnode;
+ if (iip + 1 < UBIFS_LPT_FANOUT)
+ break;
+ iip = nnode->iip;
+ }
+ /* Go right */
+ iip += 1;
+ /* Descend to the pnode */
+ h += 1;
+ for (; h < c->lpt_hght; h++) {
+ nnode = scan_get_nnode(c, path + h, nnode, iip);
+ if (IS_ERR(nnode)) {
+ err = PTR_ERR(nnode);
+ goto out;
+ }
+ /* Below the branch we turned into, always take the leftmost child */
+ iip = 0;
+ }
+ pnode = scan_get_pnode(c, path + h, nnode, iip);
+ if (IS_ERR(pnode)) {
+ err = PTR_ERR(pnode);
+ goto out;
+ }
+ /* Start with the first lprops of the new pnode */
+ iip = 0;
+ }
+out:
+ kfree(path);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_chk_pnode - check a pnode.
+ * @c: the UBIFS file-system description object
+ * @pnode: pnode to check
+ * @col: pnode column
+ *
+ * Checks that the pnode number equals @col and, for every LEB described by
+ * the pnode that lies within the volume, that the LEB number is the expected
+ * one, that the category recorded in the flags is allowed for the LEB's
+ * taken/index state, that the lprops is actually present in the heap or list
+ * of that category, and that the free/dirty space is consistent with the
+ * category.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ int col)
+{
+ int i;
+
+ if (pnode->num != col) {
+ dbg_err("pnode num %d expected %d parent num %d iip %d",
+ pnode->num, col, pnode->parent->num, pnode->iip);
+ return -EINVAL;
+ }
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
+ int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
+ c->main_first;
+ int found, cat = lprops->flags & LPROPS_CAT_MASK;
+ struct ubifs_lpt_heap *heap;
+ struct list_head *list = NULL;
+
+ /* Slots beyond the end of the volume are not checked */
+ if (lnum >= c->leb_cnt)
+ continue;
+ if (lprops->lnum != lnum) {
+ dbg_err("bad LEB number %d expected %d",
+ lprops->lnum, lnum);
+ return -EINVAL;
+ }
+ /* A taken LEB must be uncategorized; nothing else is checked for it */
+ if (lprops->flags & LPROPS_TAKEN) {
+ if (cat != LPROPS_UNCAT) {
+ dbg_err("LEB %d taken but not uncat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ continue;
+ }
+ /* Index and non-index LEBs each have their own set of allowed categories */
+ if (lprops->flags & LPROPS_INDEX) {
+ switch (cat) {
+ case LPROPS_UNCAT:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FRDI_IDX:
+ break;
+ default:
+ dbg_err("LEB %d index but cat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ } else {
+ switch (cat) {
+ case LPROPS_UNCAT:
+ case LPROPS_DIRTY:
+ case LPROPS_FREE:
+ case LPROPS_EMPTY:
+ case LPROPS_FREEABLE:
+ break;
+ default:
+ dbg_err("LEB %d not index but cat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ }
+ /* Pick the list for list-based categories (stays NULL for heap ones) */
+ switch (cat) {
+ case LPROPS_UNCAT:
+ list = &c->uncat_list;
+ break;
+ case LPROPS_EMPTY:
+ list = &c->empty_list;
+ break;
+ case LPROPS_FREEABLE:
+ list = &c->freeable_list;
+ break;
+ case LPROPS_FRDI_IDX:
+ list = &c->frdi_idx_list;
+ break;
+ }
+ /* Make sure the lprops really is in its category's heap or list */
+ found = 0;
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ heap = &c->lpt_heap[cat - 1];
+ if (lprops->hpos < heap->cnt &&
+ heap->arr[lprops->hpos] == lprops)
+ found = 1;
+ break;
+ case LPROPS_UNCAT:
+ case LPROPS_EMPTY:
+ case LPROPS_FREEABLE:
+ case LPROPS_FRDI_IDX:
+ list_for_each_entry(lp, list, list)
+ if (lprops == lp) {
+ found = 1;
+ break;
+ }
+ break;
+ }
+ if (!found) {
+ dbg_err("LEB %d cat %d not found in cat heap/list",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ switch (cat) {
+ case LPROPS_EMPTY:
+ if (lprops->free != c->leb_size) {
+ dbg_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
+ return -EINVAL;
+ }
+ /*
+ * NOTE(review): there is no break here, so an empty LEB also
+ * goes through the free + dirty check below, which then
+ * requires dirty to be zero. Confirm this fall-through is
+ * intended.
+ */
+ case LPROPS_FREEABLE:
+ case LPROPS_FRDI_IDX:
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ dbg_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * dbg_check_lpt_nodes - check nnodes and pnodes.
+ * @c: the UBIFS file-system description object
+ * @cnode: next cnode (nnode or pnode) to check
+ * @row: row of cnode (root is zero)
+ * @col: column of cnode (leftmost is zero)
+ *
+ * Walks the part of the LPT that is in memory, starting at @cnode, without
+ * recursion: it goes down into the first child present, and when a node has
+ * no further children to visit it goes back up through the parent pointer
+ * and continues with the next sibling. Each nnode's number is compared with
+ * the number calculated from its row and column; each pnode is checked with
+ * 'dbg_chk_pnode()'. Nothing is checked unless %UBIFS_CHK_LPROPS is set in
+ * 'ubifs_chk_flags'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+ int row, int col)
+{
+ struct ubifs_nnode *nnode, *nn;
+ struct ubifs_cnode *cn;
+ int num, iip = 0, err;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ while (cnode) {
+ ubifs_assert(row >= 0);
+ /* Remember the parent: it is where we return to when done here */
+ nnode = cnode->parent;
+ if (cnode->level) {
+ /* cnode is a nnode */
+ num = calc_nnode_num(row, col);
+ if (cnode->num != num) {
+ dbg_err("nnode num %d expected %d "
+ "parent num %d iip %d", cnode->num, num,
+ (nnode ? nnode->num : 0), cnode->iip);
+ return -EINVAL;
+ }
+ nn = (struct ubifs_nnode *)cnode;
+ /* Resume at branch iip and look for the next child in memory */
+ while (iip < UBIFS_LPT_FANOUT) {
+ cn = nn->nbranch[iip].cnode;
+ if (cn) {
+ /* Go down */
+ row += 1;
+ col <<= UBIFS_LPT_FANOUT_SHIFT;
+ col += iip;
+ iip = 0;
+ cnode = cn;
+ break;
+ }
+ /* Go right */
+ iip += 1;
+ }
+ /* A child was found: process it in the next iteration */
+ if (iip < UBIFS_LPT_FANOUT)
+ continue;
+ } else {
+ struct ubifs_pnode *pnode;
+
+ /* cnode is a pnode */
+ pnode = (struct ubifs_pnode *)cnode;
+ err = dbg_chk_pnode(c, pnode, col);
+ if (err)
+ return err;
+ }
+ /* Go up and to the right */
+ row -= 1;
+ col >>= UBIFS_LPT_FANOUT_SHIFT;
+ iip = cnode->iip + 1;
+ cnode = (struct ubifs_cnode *)nnode;
+ }
+ return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
new file mode 100644
index 00000000000..5f0b83e20af
--- /dev/null
+++ b/fs/ubifs/lpt_commit.c
@@ -0,0 +1,1648 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements commit-related functionality of the LEB properties
+ * subsystem.
+ */
+
+#include <linux/crc16.h>
+#include "ubifs.h"
+
+/**
+ * first_dirty_cnode - find first dirty cnode.
+ * @nnode: nnode at which to start
+ *
+ * Starting at @nnode, this function repeatedly descends into the leftmost
+ * in-memory child that has %DIRTY_CNODE set. It returns that child as soon as
+ * it is a pnode (level zero). If the nnode reached has no dirty in-memory
+ * child, the nnode itself is returned.
+ */
+static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
+{
+	ubifs_assert(nnode);
+	for (;;) {
+		struct ubifs_cnode *dirty = NULL;
+		int slot;
+
+		/* Pick the leftmost dirty child, if any */
+		for (slot = 0; slot < UBIFS_LPT_FANOUT && !dirty; slot++) {
+			struct ubifs_cnode *child = nnode->nbranch[slot].cnode;
+
+			if (child && test_bit(DIRTY_CNODE, &child->flags))
+				dirty = child;
+		}
+
+		if (!dirty)
+			return (struct ubifs_cnode *)nnode;
+		if (dirty->level == 0)
+			return dirty;
+		/* The dirty child is a nnode, keep descending */
+		nnode = (struct ubifs_nnode *)dirty;
+	}
+}
+
+/**
+ * next_dirty_cnode - find next dirty cnode.
+ * @cnode: cnode from which to begin searching
+ *
+ * This function returns %NULL if @cnode has no parent. Otherwise it looks at
+ * the siblings to the right of @cnode: the first dirty one is returned if it
+ * is a pnode, or the result of 'first_dirty_cnode()' on it if it is a nnode.
+ * If there is no dirty sibling to the right, the parent is returned.
+ */
+static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
+{
+	struct ubifs_nnode *parent;
+	struct ubifs_cnode *sibling;
+	int slot;
+
+	ubifs_assert(cnode);
+	parent = cnode->parent;
+	if (!parent)
+		return NULL;
+
+	for (slot = cnode->iip + 1; slot < UBIFS_LPT_FANOUT; slot++) {
+		sibling = parent->nbranch[slot].cnode;
+		if (!sibling || !test_bit(DIRTY_CNODE, &sibling->flags))
+			continue;
+		/* A pnode is returned as is, a nnode is descended into */
+		if (sibling->level != 0)
+			return first_dirty_cnode((struct ubifs_nnode *)sibling);
+		return sibling;
+	}
+	return (struct ubifs_cnode *)parent;
+}
+
+/**
+ * get_cnodes_to_commit - create list of dirty cnodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * The dirty cnodes are chained through their 'cnext' pointers into a circular
+ * list whose head is stored in @c->lpt_cnext, and each of them gets the
+ * %COW_ZNODE flag set. Nothing is done if there is no root nnode or the root
+ * is not dirty.
+ *
+ * This function returns the number of cnodes to commit.
+ */
+static int get_cnodes_to_commit(struct ubifs_info *c)
+{
+	struct ubifs_cnode *cur, *succ;
+	int total;
+
+	if (!c->nroot || !test_bit(DIRTY_CNODE, &c->nroot->flags))
+		return 0;
+
+	cur = first_dirty_cnode(c->nroot);
+	c->lpt_cnext = cur;
+	if (!cur)
+		return 0;
+
+	for (total = 1; ; total++) {
+		ubifs_assert(!test_bit(COW_ZNODE, &cur->flags));
+		__set_bit(COW_ZNODE, &cur->flags);
+		succ = next_dirty_cnode(cur);
+		if (!succ)
+			break;
+		cur->cnext = succ;
+		cur = succ;
+	}
+	/* Last cnode points back to the head, making the list circular */
+	cur->cnext = c->lpt_cnext;
+
+	dbg_cmt("committing %d cnodes", total);
+	dbg_lp("committing %d cnodes", total);
+	ubifs_assert(total == c->dirty_nn_cnt + c->dirty_pn_cnt);
+	return total;
+}
+
+/**
+ * upd_ltab - update LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ *
+ * The free space of the ltab entry for @lnum is replaced by @free, while
+ * @dirty is added to the dirty space already recorded.
+ */
+static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	int idx = lnum - c->lpt_first;
+
+	dbg_lp("LEB %d free %d dirty %d to %d +%d",
+	       lnum, c->ltab[idx].free, c->ltab[idx].dirty, free, dirty);
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	c->ltab[idx].free = free;
+	c->ltab[idx].dirty += dirty;
+}
+
+/**
+ * alloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function finds the next empty LEB in the ltab starting after @lnum and
+ * wrapping around to the beginning of the ltab. LEBs marked for trivial GC or
+ * already taken for this commit are skipped. If an empty LEB is found, it is
+ * marked as taken for the commit, returned in @lnum and the function returns
+ * %0. Otherwise the function returns -ENOSPC. Note however, that LPT is
+ * designed never to run out of space.
+ */
+static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, start;
+
+	start = *lnum - c->lpt_first + 1;
+
+	/* First the entries after the current LEB ... */
+	for (i = start; i < c->lpt_lebs; i++)
+		if (!c->ltab[i].tgc && !c->ltab[i].cmt &&
+		    c->ltab[i].free == c->leb_size)
+			goto found;
+
+	/* ... then wrap around to the beginning */
+	for (i = 0; i < start; i++)
+		if (!c->ltab[i].tgc && !c->ltab[i].cmt &&
+		    c->ltab[i].free == c->leb_size)
+			goto found;
+
+	dbg_err("last LEB %d", *lnum);
+	dump_stack();
+	return -ENOSPC;
+
+found:
+	c->ltab[i].cmt = 1;
+	*lnum = i + c->lpt_first;
+	return 0;
+}
+
+/**
+ * layout_cnodes - layout cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks the list of cnodes starting at @c->lpt_cnext and assigns
+ * each of them a position (LEB number and offset), beginning at the LPT head
+ * (@c->nhead_lnum, @c->nhead_offs). The position is recorded in the parent's
+ * branch, or in @c->lpt_lnum and @c->lpt_offs for the node without a parent.
+ * Positions for the ltab and, in the "big" model, the lsave are chosen as
+ * well. Whenever a LEB cannot take the next node, its ltab entry is updated
+ * and another LEB is obtained with 'alloc_lpt_leb()'. The dirty nnode and
+ * pnode counters are decremented as the nodes are laid out. No flash writes
+ * are issued from this function.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_cnodes(struct ubifs_info *c)
+{
+ int lnum, offs, len, alen, done_lsave, done_ltab, err;
+ struct ubifs_cnode *cnode;
+
+ cnode = c->lpt_cnext;
+ if (!cnode)
+ return 0;
+ lnum = c->nhead_lnum;
+ offs = c->nhead_offs;
+ /* Try to place lsave and ltab nicely */
+ done_lsave = !c->big_lpt; /* lsave is placed only if big_lpt is set */
+ done_ltab = 0;
+ if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ }
+
+ if (offs + c->ltab_sz <= c->leb_size) {
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ }
+
+ do {
+ if (cnode->level) {
+ len = c->nnode_sz;
+ c->dirty_nn_cnt -= 1;
+ } else {
+ len = c->pnode_sz;
+ c->dirty_pn_cnt -= 1;
+ }
+ while (offs + len > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs); /* alignment padding counts as dirty */
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ /* Try to place lsave and ltab nicely */
+ if (!done_lsave) {
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ continue; /* re-check that the cnode still fits */
+ }
+ if (!done_ltab) {
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ continue; /* re-check that the cnode still fits */
+ }
+ break;
+ }
+ if (cnode->parent) {
+ cnode->parent->nbranch[cnode->iip].lnum = lnum;
+ cnode->parent->nbranch[cnode->iip].offs = offs;
+ } else {
+ c->lpt_lnum = lnum;
+ c->lpt_offs = offs;
+ }
+ offs += len;
+ cnode = cnode->cnext;
+ } while (cnode && cnode != c->lpt_cnext);
+
+ /* Make sure to place LPT's save table */
+ if (!done_lsave) {
+ if (offs + c->lsave_sz > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ }
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ }
+
+ /* Make sure to place LPT's own lprops table */
+ if (!done_ltab) {
+ if (offs + c->ltab_sz > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ }
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ }
+
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs); /* account for the last LEB used */
+ return 0;
+}
+
+/**
+ * realloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function duplicates exactly the results of the function alloc_lpt_leb.
+ * It is used during end commit to reallocate the same LEB numbers that were
+ * allocated by alloc_lpt_leb during start commit.
+ *
+ * The ltab is searched, starting after @lnum and wrapping around, for the
+ * next LEB that carries the "taken for commit" mark. If one is found, the mark
+ * is cleared, the LEB number is returned in @lnum and the function returns
+ * %0. Otherwise the function returns -ENOSPC. Note however, that LPT is
+ * designed never to run out of space.
+ */
+static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, start;
+
+	start = *lnum - c->lpt_first + 1;
+
+	for (i = start; i < c->lpt_lebs; i++)
+		if (c->ltab[i].cmt)
+			goto found;
+
+	for (i = 0; i < start; i++)
+		if (c->ltab[i].cmt)
+			goto found;
+
+	dbg_err("last LEB %d", *lnum);
+	dump_stack();
+	return -ENOSPC;
+
+found:
+	c->ltab[i].cmt = 0;
+	*lnum = i + c->lpt_first;
+	return 0;
+}
+
+/**
+ * write_cnodes - write cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function packs the cnodes on the list starting at @c->lpt_cnext, the
+ * ltab and (in the "big" model) the lsave into @c->lpt_buf and writes them
+ * out, starting at the LPT head. It follows the same placement steps as
+ * 'layout_cnodes()', using 'realloc_lpt_leb()' to get the next LEB. Each LEB
+ * that is started from offset zero is unmapped before it is written. The
+ * %DIRTY_CNODE and %COW_ZNODE flags of every written cnode are cleared and
+ * the LPT head is advanced past the data written.
+ *
+ * @from is the buffer/LEB offset of the first byte not yet written to the
+ * current LEB, so it has to be reset together with @offs every time a new LEB
+ * is started.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_cnodes(struct ubifs_info *c)
+{
+	int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
+	struct ubifs_cnode *cnode;
+	void *buf = c->lpt_buf;
+
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	lnum = c->nhead_lnum;
+	offs = c->nhead_offs;
+	from = offs;
+	/* Ensure empty LEB is unmapped */
+	if (offs == 0) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+	/* Try to place lsave and ltab nicely */
+	done_lsave = !c->big_lpt;
+	done_ltab = 0;
+	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+	}
+
+	if (offs + c->ltab_sz <= c->leb_size) {
+		done_ltab = 1;
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+	}
+
+	/* Loop for each cnode */
+	do {
+		if (cnode->level)
+			len = c->nnode_sz;
+		else
+			len = c->pnode_sz;
+		while (offs + len > c->leb_size) {
+			/* Flush what was packed for this LEB, padded with 0xff */
+			wlen = offs - from;
+			if (wlen) {
+				alen = ALIGN(wlen, c->min_io_size);
+				memset(buf + offs, 0xff, alen - wlen);
+				err = ubifs_leb_write(c, lnum, buf + from, from,
+						      alen, UBI_SHORTTERM);
+				if (err)
+					return err;
+			}
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				return err;
+			offs = 0;
+			from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+			/* Try to place lsave and ltab nicely */
+			if (!done_lsave) {
+				done_lsave = 1;
+				ubifs_pack_lsave(c, buf + offs, c->lsave);
+				offs += c->lsave_sz;
+				continue;
+			}
+			if (!done_ltab) {
+				done_ltab = 1;
+				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+				offs += c->ltab_sz;
+				continue;
+			}
+			break;
+		}
+		if (cnode->level)
+			ubifs_pack_nnode(c, buf + offs,
+					 (struct ubifs_nnode *)cnode);
+		else
+			ubifs_pack_pnode(c, buf + offs,
+					 (struct ubifs_pnode *)cnode);
+		/*
+		 * The reason for the barriers is the same as in case of TNC.
+		 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
+		 * 'dirty_cow_pnode()' are the functions for which this is
+		 * important.
+		 */
+		clear_bit(DIRTY_CNODE, &cnode->flags);
+		smp_mb__before_clear_bit();
+		clear_bit(COW_ZNODE, &cnode->flags);
+		smp_mb__after_clear_bit();
+		offs += len;
+		cnode = cnode->cnext;
+	} while (cnode && cnode != c->lpt_cnext);
+
+	/* Make sure to place LPT's save table */
+	if (!done_lsave) {
+		if (offs + c->lsave_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+					      UBI_SHORTTERM);
+			if (err)
+				return err;
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				return err;
+			/*
+			 * A new LEB is started, so nothing of it has been
+			 * written yet: reset 'from' as well as 'offs'.
+			 */
+			offs = 0;
+			from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+	}
+
+	/* Make sure to place LPT's own lprops table */
+	if (!done_ltab) {
+		if (offs + c->ltab_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+					      UBI_SHORTTERM);
+			if (err)
+				return err;
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				return err;
+			/* New LEB: reset 'from' together with 'offs' */
+			offs = 0;
+			from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		done_ltab = 1;
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+	}
+
+	/* Write remaining data in buffer */
+	wlen = offs - from;
+	alen = ALIGN(wlen, c->min_io_size);
+	memset(buf + offs, 0xff, alen - wlen);
+	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
+	if (err)
+		return err;
+	c->nhead_lnum = lnum;
+	c->nhead_offs = ALIGN(offs, c->min_io_size);
+
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+	return 0;
+}
+
+/**
+ * next_pnode - find next pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * The right-hand sibling of @pnode is tried first; if @pnode is the last
+ * child of its parent, the function climbs until it can step right and then
+ * descends along branch zero to a pnode. A branch whose LEB number is zero is
+ * treated as absent.
+ *
+ * This function returns the next pnode or %NULL if there are no more pnodes.
+ * Error pointers returned by 'ubifs_get_nnode()' are passed on to the caller,
+ * and the value of 'ubifs_get_pnode()' is returned unchanged.
+ */
+static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
+ struct ubifs_pnode *pnode)
+{
+ struct ubifs_nnode *nnode;
+ int iip;
+
+ /* Try to go right */
+ nnode = pnode->parent;
+ iip = pnode->iip + 1;
+ if (iip < UBIFS_LPT_FANOUT) {
+ /* We assume here that LEB zero is never an LPT LEB */
+ if (nnode->nbranch[iip].lnum)
+ return ubifs_get_pnode(c, nnode, iip);
+ else
+ return NULL;
+ }
+
+ /* Go up while can't go right */
+ do {
+ iip = nnode->iip + 1;
+ nnode = nnode->parent;
+ if (!nnode) /* climbed past the topmost nnode */
+ return NULL;
+ /* We assume here that LEB zero is never an LPT LEB */
+ } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+
+ /* Go right */
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return (void *)nnode;
+
+ /* Go down to level 1 */
+ while (nnode->level > 1) {
+ nnode = ubifs_get_nnode(c, nnode, 0);
+ if (IS_ERR(nnode))
+ return (void *)nnode;
+ }
+
+ return ubifs_get_pnode(c, nnode, 0);
+}
+
+/**
+ * pnode_lookup - lookup a pnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: pnode number (0 to main_lebs - 1)
+ *
+ * The root nnode is read first if it is not in memory yet. The tree is then
+ * descended from the root, each level selecting its branch from the bits of
+ * @i.
+ *
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
+{
+	struct ubifs_nnode *nn;
+	int err, level, slot, shift;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	nn = c->nroot;
+	i <<= UBIFS_LPT_FANOUT_SHIFT;
+	shift = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+
+	/* One nnode per level below the root */
+	level = 1;
+	while (level < c->lpt_hght) {
+		slot = (i >> shift) & (UBIFS_LPT_FANOUT - 1);
+		nn = ubifs_get_nnode(c, nn, slot);
+		if (IS_ERR(nn))
+			return ERR_PTR(PTR_ERR(nn));
+		shift -= UBIFS_LPT_FANOUT_SHIFT;
+		level += 1;
+	}
+
+	slot = (i >> shift) & (UBIFS_LPT_FANOUT - 1);
+	return ubifs_get_pnode(c, nn, slot);
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ *
+ * The size of one pnode is added as dirt to the LEB recorded for @pnode in
+ * its parent's branch.
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	int lnum = pnode->parent->nbranch[pnode->iip].lnum;
+
+	ubifs_add_lpt_dirt(c, lnum, c->pnode_sz);
+}
+
+/**
+ * do_make_pnode_dirty - mark a pnode dirty.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to mark dirty
+ *
+ * If @pnode is not dirty yet, it is marked dirty and accounted for, and so is
+ * every ancestor up to (but not including) the first one that is already
+ * dirty. Assumes the cnext list is empty, i.e. it is not called during commit.
+ */
+static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	struct ubifs_nnode *anc;
+
+	if (test_and_set_bit(DIRTY_CNODE, &pnode->flags))
+		return; /* Was dirty already */
+
+	c->dirty_pn_cnt += 1;
+	add_pnode_dirt(c, pnode);
+
+	/* Mark parent and ancestors dirty too */
+	for (anc = pnode->parent; anc; anc = anc->parent) {
+		if (test_and_set_bit(DIRTY_CNODE, &anc->flags))
+			break;
+		c->dirty_nn_cnt += 1;
+		ubifs_add_nnode_dirt(c, anc);
+	}
+}
+
+/**
+ * make_tree_dirty - mark the entire LEB properties tree dirty.
+ * @c: UBIFS file-system description object
+ *
+ * This function is used by the "small" LPT model to cause the entire LEB
+ * properties tree to be written. The "small" LPT model does not use LPT
+ * garbage collection because it is more efficient to write the entire tree
+ * (because it is small).
+ *
+ * The pnodes are visited one after another, starting with pnode number zero,
+ * and each of them is marked dirty together with its ancestors.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_tree_dirty(struct ubifs_info *c)
+{
+	struct ubifs_pnode *pnode;
+
+	pnode = pnode_lookup(c, 0);
+	/*
+	 * 'pnode_lookup()' reports failure with an error pointer, which is
+	 * not %NULL and must not reach 'do_make_pnode_dirty()'.
+	 */
+	if (IS_ERR(pnode))
+		return PTR_ERR(pnode);
+
+	while (pnode) {
+		do_make_pnode_dirty(c, pnode);
+		pnode = next_pnode(c, pnode);
+		if (IS_ERR(pnode))
+			return PTR_ERR(pnode);
+	}
+	return 0;
+}
+
+/**
+ * need_write_all - determine if the LPT area is running out of free space.
+ * @c: UBIFS file-system description object
+ *
+ * The space counted as available is: the rest of the LEB at the LPT head,
+ * plus every other LPT LEB that is either completely free or consists of
+ * free and dirty space only.
+ *
+ * This function returns %1 if the available space is not more than twice the
+ * LPT size and %0 otherwise.
+ */
+static int need_write_all(struct ubifs_info *c)
+{
+	long long avail = 0;
+	int idx;
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		if (idx + c->lpt_first == c->nhead_lnum) {
+			avail += c->leb_size - c->nhead_offs;
+			continue;
+		}
+		if (c->ltab[idx].free == c->leb_size ||
+		    c->ltab[idx].free + c->ltab[idx].dirty == c->leb_size)
+			avail += c->leb_size;
+	}
+
+	/* Less than twice the size left */
+	return avail <= c->lpt_sz * 2;
+}
+
+/**
+ * lpt_tgc_start - start trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called during start commit to mark LPT LEBs for trivial GC.
+ * A marked LEB is recorded in the ltab as completely free; the LEB at the LPT
+ * head is never marked.
+ */
+static void lpt_tgc_start(struct ubifs_info *c)
+{
+	int idx;
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		if (idx + c->lpt_first == c->nhead_lnum)
+			continue;
+		if (c->ltab[idx].dirty <= 0)
+			continue;
+		if (c->ltab[idx].free + c->ltab[idx].dirty != c->leb_size)
+			continue;
+
+		c->ltab[idx].tgc = 1;
+		c->ltab[idx].free = c->leb_size;
+		c->ltab[idx].dirty = 0;
+		dbg_lp("LEB %d", idx + c->lpt_first);
+	}
+}
+
+/**
+ * lpt_tgc_end - end trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called after the commit is completed (master node has been
+ * written) and unmaps LPT LEBs that were marked for trivial GC, clearing the
+ * mark of each LEB once it has been unmapped.
+ *
+ * This function returns %0 on success and the error code of the failed unmap
+ * otherwise.
+ */
+static int lpt_tgc_end(struct ubifs_info *c)
+{
+	int idx, err;
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		if (!c->ltab[idx].tgc)
+			continue;
+
+		err = ubifs_leb_unmap(c, idx + c->lpt_first);
+		if (err)
+			return err;
+		c->ltab[idx].tgc = 0;
+		dbg_lp("LEB %d", idx + c->lpt_first);
+	}
+	return 0;
+}
+
+/**
+ * populate_lsave - fill the lsave array with important LEB numbers.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is only called for the "big" model. It records a small number
+ * of LEB numbers of important LEBs. Important LEBs are ones that are (from
+ * most important to least important): empty, freeable, freeable index, dirty
+ * index, dirty or free. Upon mount, we read this list of LEB numbers and bring
+ * their pnodes into memory. That will stop us from having to scan the LPT
+ * straight away. For the "small" model we assume that scanning the LPT is no
+ * big deal.
+ *
+ * The lsave is marked dirty first (adding its size as LPT dirt) unless it is
+ * marked so already. Slots left over are filled with the first main LEB.
+ */
+static void populate_lsave(struct ubifs_info *c)
+{
+	/* Heap categories in order of decreasing importance */
+	const int heap_cats[] = { LPROPS_DIRTY_IDX, LPROPS_DIRTY, LPROPS_FREE };
+	const int nr_heap_cats = sizeof(heap_cats) / sizeof(heap_cats[0]);
+	struct ubifs_lprops *lp;
+	struct ubifs_lpt_heap *heap;
+	int h, i, used = 0;
+
+	ubifs_assert(c->big_lpt);
+	if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+		c->lpt_drty_flgs |= LSAVE_DIRTY;
+		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+	}
+
+	list_for_each_entry(lp, &c->empty_list, list) {
+		c->lsave[used++] = lp->lnum;
+		if (used >= c->lsave_cnt)
+			return;
+	}
+	list_for_each_entry(lp, &c->freeable_list, list) {
+		c->lsave[used++] = lp->lnum;
+		if (used >= c->lsave_cnt)
+			return;
+	}
+	list_for_each_entry(lp, &c->frdi_idx_list, list) {
+		c->lsave[used++] = lp->lnum;
+		if (used >= c->lsave_cnt)
+			return;
+	}
+
+	for (h = 0; h < nr_heap_cats; h++) {
+		heap = &c->lpt_heap[heap_cats[h] - 1];
+		for (i = 0; i < heap->cnt; i++) {
+			c->lsave[used++] = heap->arr[i]->lnum;
+			if (used >= c->lsave_cnt)
+				return;
+		}
+	}
+
+	/* Fill it up completely */
+	while (used < c->lsave_cnt)
+		c->lsave[used++] = c->main_first;
+}
+
+/**
+ * nnode_lookup - lookup a nnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: nnode number
+ *
+ * The root nnode is read first if it is not in memory yet. The low bits of @i
+ * select the branch to follow at each level, and the descent stops when
+ * nothing but those bits is left of @i.
+ *
+ * This function returns a pointer to the nnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i)
+{
+	struct ubifs_nnode *nn;
+	int err, slot;
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	nn = c->nroot;
+	while ((i >> UBIFS_LPT_FANOUT_SHIFT) != 0) {
+		slot = i & (UBIFS_LPT_FANOUT - 1);
+		i >>= UBIFS_LPT_FANOUT_SHIFT;
+		nn = ubifs_get_nnode(c, nn, slot);
+		if (IS_ERR(nn))
+			return nn;
+	}
+	return nn;
+}
+
+/**
+ * make_nnode_dirty - find a nnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: nnode number of nnode to make dirty
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * Nothing is done if the nnode is obsolete, i.e. the position currently
+ * recorded for it differs from @lnum:@offs.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+			    int offs)
+{
+	struct ubifs_nnode *nn;
+	int cur_lnum, cur_offs;
+
+	nn = nnode_lookup(c, node_num);
+	if (IS_ERR(nn))
+		return PTR_ERR(nn);
+
+	/* Where is the current version of this nnode recorded to be? */
+	if (nn->parent) {
+		cur_lnum = nn->parent->nbranch[nn->iip].lnum;
+		cur_offs = nn->parent->nbranch[nn->iip].offs;
+	} else {
+		cur_lnum = c->lpt_lnum;
+		cur_offs = c->lpt_offs;
+	}
+	if (cur_lnum != lnum || cur_offs != offs)
+		return 0; /* nnode is obsolete */
+
+	/* Assumes cnext list is empty i.e. not called during commit */
+	if (test_and_set_bit(DIRTY_CNODE, &nn->flags))
+		return 0;
+
+	/* Account for this nnode, then mark parent and ancestors dirty too */
+	do {
+		c->dirty_nn_cnt += 1;
+		ubifs_add_nnode_dirt(c, nn);
+		nn = nn->parent;
+	} while (nn && !test_and_set_bit(DIRTY_CNODE, &nn->flags));
+
+	return 0;
+}
+
+/**
+ * make_pnode_dirty - find a pnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: pnode number of pnode to make dirty
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * Nothing is done if the position recorded for the pnode in its parent
+ * differs from @lnum:@offs.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+			    int offs)
+{
+	struct ubifs_nbranch *br;
+	struct ubifs_pnode *pn;
+
+	pn = pnode_lookup(c, node_num);
+	if (IS_ERR(pn))
+		return PTR_ERR(pn);
+
+	br = &pn->parent->nbranch[pn->iip];
+	if (br->lnum == lnum && br->offs == offs)
+		do_make_pnode_dirty(c, pn);
+	return 0;
+}
+
+/**
+ * make_ltab_dirty - make ltab node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where ltab was written
+ * @offs: offset where ltab was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * Nothing is done if @lnum:@offs is not the current ltab position or the ltab
+ * is marked dirty already.
+ *
+ * This function always returns %0.
+ */
+static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	/* Any other position means that this ltab node is obsolete */
+	int is_live = (lnum == c->ltab_lnum && offs == c->ltab_offs);
+
+	if (is_live && !(c->lpt_drty_flgs & LTAB_DIRTY)) {
+		c->lpt_drty_flgs |= LTAB_DIRTY;
+		ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+	}
+	return 0;
+}
+
+/**
+ * make_lsave_dirty - make lsave node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where lsave was written
+ * @offs: offset where lsave was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * Nothing is done if @lnum:@offs is not the current lsave position or the
+ * lsave is marked dirty already.
+ *
+ * This function always returns %0.
+ */
+static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+	/* Any other position means that this lsave node is obsolete */
+	int is_live = (lnum == c->lsave_lnum && offs == c->lsave_offs);
+
+	if (is_live && !(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+		c->lpt_drty_flgs |= LSAVE_DIRTY;
+		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+	}
+	return 0;
+}
+
+/**
+ * make_node_dirty - make node dirty.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ * @node_num: node number
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * The work is passed on to the helper for @node_type; @node_num is used for
+ * nnodes and pnodes only.
+ *
+ * This function returns %0 on success and a negative error code on failure
+ * (%-EINVAL if @node_type is not one of the known LPT node types).
+ */
+static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
+			   int lnum, int offs)
+{
+	if (node_type == UBIFS_LPT_NNODE)
+		return make_nnode_dirty(c, node_num, lnum, offs);
+	if (node_type == UBIFS_LPT_PNODE)
+		return make_pnode_dirty(c, node_num, lnum, offs);
+	if (node_type == UBIFS_LPT_LTAB)
+		return make_ltab_dirty(c, lnum, offs);
+	if (node_type == UBIFS_LPT_LSAVE)
+		return make_lsave_dirty(c, lnum, offs);
+	return -EINVAL;
+}
+
+/**
+ * get_lpt_node_len - return the length of a node based on its type.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ *
+ * This function returns the size of a nnode, pnode, ltab or lsave node, or
+ * %0 if @node_type is none of these.
+ */
+static int get_lpt_node_len(struct ubifs_info *c, int node_type)
+{
+	int node_len = 0;
+
+	switch (node_type) {
+	case UBIFS_LPT_NNODE:
+		node_len = c->nnode_sz;
+		break;
+	case UBIFS_LPT_PNODE:
+		node_len = c->pnode_sz;
+		break;
+	case UBIFS_LPT_LTAB:
+		node_len = c->ltab_sz;
+		break;
+	case UBIFS_LPT_LSAVE:
+		node_len = c->lsave_sz;
+		break;
+	}
+	return node_len;
+}
+
+/**
+ * get_pad_len - return the length of padding in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer (not examined)
+ * @len: length of buffer
+ *
+ * @len is taken to be the number of bytes left up to the end of the LEB. The
+ * function returns the distance from the corresponding LEB offset to the next
+ * multiple of the minimal I/O unit size, which is %0 if the offset is aligned
+ * already or if the minimal I/O unit size is one.
+ */
+static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
+{
+	int offs;
+
+	if (c->min_io_size == 1)
+		return 0;
+
+	offs = c->leb_size - len;
+	return ALIGN(offs, c->min_io_size) - offs;
+}
+
+/**
+ * get_lpt_node_type - return type (and node number) of a node in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @node_num: node number is returned here
+ *
+ * The node type and then the node number are unpacked from the bits that
+ * follow the CRC at the beginning of @buf.
+ */
+static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
+{
+	uint8_t *p = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, type;
+
+	/* The order matters: both calls advance the same bit position */
+	type = ubifs_unpack_bits(&p, &bit, UBIFS_LPT_TYPE_BITS);
+	*node_num = ubifs_unpack_bits(&p, &bit, c->pcnt_bits);
+	return type;
+}
+
+/**
+ * is_a_node - determine if a buffer contains a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @len: length of buffer
+ *
+ * The buffer is considered to contain a node if it is long enough to hold the
+ * CRC and the node type, the type is a known LPT node type whose length does
+ * not exceed @len, and the stored CRC matches the CRC calculated over the
+ * rest of the node.
+ *
+ * This function returns %1 if the buffer contains a node or %0 if it does not.
+ */
+static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, node_type, node_len;
+	uint16_t crc, calc_crc;
+
+	/*
+	 * Do not look at the type bits unless they are inside the buffer: the
+	 * caller may be at (or very near) the end of the LEB.
+	 */
+	if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8)
+		return 0;
+	node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
+	if (node_type == UBIFS_LPT_NOT_A_NODE)
+		return 0;
+	node_len = get_lpt_node_len(c, node_type);
+	if (!node_len || node_len > len)
+		return 0;
+	/* Start again from the beginning of the buffer to fetch the CRC */
+	pos = 0;
+	addr = buf;
+	crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+	calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+			 node_len - UBIFS_LPT_CRC_BYTES);
+	if (crc != calc_crc)
+		return 0;
+	return 1;
+}
+
+
+/**
+ * lpt_gc_lnum - garbage collect a LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to garbage collect
+ *
+ * LPT garbage collection is used only for the "big" LPT model
+ * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes
+ * in the LEB being garbage-collected as dirty. The dirty nodes are written
+ * next commit, after which the LEB is free to be reused.
+ *
+ * The whole LEB is read into @c->lpt_buf and scanned from the beginning.
+ * Where no node is recognized, the scan skips to the next minimal I/O unit
+ * boundary, and it ends when there is no such padding to skip. @c->lp_mutex
+ * is taken around each call to 'make_node_dirty()'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
+{
+ int err, len = c->leb_size, node_type, node_num, node_len, offs;
+ void *buf = c->lpt_buf;
+
+ dbg_lp("LEB %d", lnum);
+ err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+ if (err) {
+ ubifs_err("cannot read LEB %d, error %d", lnum, err);
+ return err;
+ }
+ while (1) {
+ if (!is_a_node(c, buf, len)) {
+ int pad_len;
+
+ pad_len = get_pad_len(c, buf, len);
+ if (pad_len) {
+ buf += pad_len;
+ len -= pad_len;
+ continue;
+ }
+ return 0; /* no node and no padding to skip: scan is over */
+ }
+ node_type = get_lpt_node_type(c, buf, &node_num);
+ node_len = get_lpt_node_len(c, node_type);
+ offs = c->leb_size - len; /* offset of this node within the LEB */
+ ubifs_assert(node_len != 0);
+ mutex_lock(&c->lp_mutex);
+ err = make_node_dirty(c, node_type, node_num, lnum, offs);
+ mutex_unlock(&c->lp_mutex);
+ if (err)
+ return err;
+ buf += node_len;
+ len -= node_len;
+ }
+ return 0; /* not reached: the loop above is left only by returning */
+}
+
+/**
+ * lpt_gc - LPT garbage collection.
+ * @c: UBIFS file-system description object
+ *
+ * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. The
+ * LEB with the most dirty space is selected; the LEB at the LPT head and LEBs
+ * that consist of free and dirty space only are not considered. The ltab is
+ * examined under @c->lp_mutex, which is released before 'lpt_gc_lnum()' is
+ * called.
+ *
+ * Returns %0 on success and a negative error code on failure (%-ENOSPC if no
+ * LEB could be selected).
+ */
+static int lpt_gc(struct ubifs_info *c)
+{
+	int idx, victim = -1, most_dirty = 0;
+
+	mutex_lock(&c->lp_mutex);
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		ubifs_assert(!c->ltab[idx].tgc);
+		if (idx + c->lpt_first == c->nhead_lnum)
+			continue;
+		if (c->ltab[idx].free + c->ltab[idx].dirty == c->leb_size)
+			continue;
+		if (c->ltab[idx].dirty > most_dirty) {
+			most_dirty = c->ltab[idx].dirty;
+			victim = idx + c->lpt_first;
+		}
+	}
+	mutex_unlock(&c->lp_mutex);
+
+	if (victim == -1)
+		return -ENOSPC;
+	return lpt_gc_lnum(c, victim);
+}
+
+/**
+ * ubifs_lpt_start_commit - UBIFS commit starts.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when UBIFS starts the commit operation.
+ * This function "freezes" all currently dirty LEB properties and does not
+ * change them anymore. Further changes are saved and tracked separately
+ * because they are not part of this commit. This function returns zero in case
+ * of success and a negative error code in case of failure.
+ *
+ * The work is done under @c->lp_mutex. The mutex is dropped around calls to
+ * 'lpt_gc()', which takes it itself; if 'lpt_gc()' fails, the function
+ * returns at once, with the mutex not held.
+ */
+int ubifs_lpt_start_commit(struct ubifs_info *c)
+{
+ int err, cnt;
+
+ dbg_lp("");
+
+ mutex_lock(&c->lp_mutex);
+ err = dbg_check_ltab(c);
+ if (err)
+ goto out;
+
+ if (c->check_lpt_free) {
+ /*
+ * We ensure there is enough free space in
+ * ubifs_lpt_post_commit() by marking nodes dirty. That
+ * information is lost when we unmount, so we also need
+ * to check free space once after mounting also.
+ */
+ c->check_lpt_free = 0;
+ while (need_write_all(c)) {
+ mutex_unlock(&c->lp_mutex);
+ err = lpt_gc(c);
+ if (err)
+ return err; /* the mutex is not held here, so no 'goto out' */
+ mutex_lock(&c->lp_mutex);
+ }
+ }
+
+ lpt_tgc_start(c);
+
+ if (!c->dirty_pn_cnt) {
+ dbg_cmt("no cnodes to commit");
+ err = 0;
+ goto out;
+ }
+
+ if (!c->big_lpt && need_write_all(c)) {
+ /* If needed, write everything */
+ err = make_tree_dirty(c);
+ if (err)
+ goto out;
+ lpt_tgc_start(c);
+ }
+
+ if (c->big_lpt)
+ populate_lsave(c);
+
+ cnt = get_cnodes_to_commit(c);
+ ubifs_assert(cnt != 0);
+
+ err = layout_cnodes(c);
+ if (err)
+ goto out;
+
+ /* Copy the LPT's own lprops for end commit to write */
+ memcpy(c->ltab_cmt, c->ltab,
+ sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
+
+out:
+ mutex_unlock(&c->lp_mutex);
+ return err;
+}
+
+/**
+ * free_obsolete_cnodes - free obsolete cnodes for commit end.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks the list of cnodes which starts at @c->lpt_cnext and is
+ * linked via the 'cnext' field, until the walk arrives back at @c->lpt_cnext.
+ * Cnodes which have the %OBSOLETE_CNODE flag set are freed, the other cnodes
+ * only have their 'cnext' pointer reset to %NULL. Finally @c->lpt_cnext is set
+ * to %NULL. Nothing is done if @c->lpt_cnext is already %NULL.
+ */
+static void free_obsolete_cnodes(struct ubifs_info *c)
+{
+ struct ubifs_cnode *cnode, *cnext;
+
+ cnext = c->lpt_cnext;
+ if (!cnext)
+ return;
+ do {
+ cnode = cnext;
+ cnext = cnode->cnext; /* read the link before cnode may be freed */
+ if (test_bit(OBSOLETE_CNODE, &cnode->flags))
+ kfree(cnode);
+ else
+ cnode->cnext = NULL;
+ } while (cnext != c->lpt_cnext);
+ c->lpt_cnext = NULL;
+}
+
+/**
+ * ubifs_lpt_end_commit - finish the commit operation.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when the commit operation finishes. It
+ * flushes the changes which were "frozen" by 'ubifs_lpt_start_commit()' to
+ * the media. Returns zero in case of success and a negative error code in case
+ * of failure.
+ *
+ * If @c->lpt_cnext is %NULL there is nothing to write and zero is returned
+ * straight away. If 'write_cnodes()' fails, the cnode list is left as is.
+ */
+int ubifs_lpt_end_commit(struct ubifs_info *c)
+{
+ int err;
+
+ dbg_lp("");
+
+ if (!c->lpt_cnext)
+ return 0;
+
+ err = write_cnodes(c);
+ if (err)
+ return err;
+
+ mutex_lock(&c->lp_mutex);
+ free_obsolete_cnodes(c);
+ mutex_unlock(&c->lp_mutex);
+
+ return 0;
+}
+
+/**
+ * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial GC is completed after a commit. Also LPT GC is done after a
+ * commit for the "big" LPT model: 'lpt_gc()' is called repeatedly for as long
+ * as 'need_write_all()' returns non-zero.
+ *
+ * @c->lp_mutex is taken here and is always released before returning. It is
+ * temporarily dropped around every 'lpt_gc()' call because 'lpt_gc()' takes
+ * the mutex itself.
+ *
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_lpt_post_commit(struct ubifs_info *c)
+{
+ int err;
+
+ mutex_lock(&c->lp_mutex);
+ err = lpt_tgc_end(c);
+ if (err)
+ goto out;
+ if (c->big_lpt)
+ while (need_write_all(c)) {
+ mutex_unlock(&c->lp_mutex);
+ err = lpt_gc(c);
+ if (err)
+ return err; /* lp_mutex is not held at this point */
+ mutex_lock(&c->lp_mutex);
+ }
+out:
+ mutex_unlock(&c->lp_mutex);
+ return err;
+}
+
+/**
+ * first_nnode - find the first nnode in memory.
+ * @c: UBIFS file-system description object
+ * @hght: height of tree where nnode found is returned here
+ *
+ * This function starts at @c->nroot and keeps descending into the first
+ * non-%NULL 'nbranch[].nnode' child, for at most @c->lpt_hght - 1 levels. The
+ * descent stops early at an nnode which has no child in memory. @hght is set
+ * to the number of levels descended (%0 means the root).
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght)
+{
+ struct ubifs_nnode *nnode;
+ int h, i, found;
+
+ nnode = c->nroot;
+ *hght = 0;
+ if (!nnode)
+ return NULL;
+ for (h = 1; h < c->lpt_hght; h++) {
+ found = 0;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (nnode->nbranch[i].nnode) {
+ found = 1;
+ nnode = nnode->nbranch[i].nnode;
+ *hght = h;
+ break;
+ }
+ }
+ if (!found)
+ break; /* no child in memory, so stop at this nnode */
+ }
+ return nnode;
+}
+
+/**
+ * next_nnode - find the next nnode in memory.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode from which to start.
+ * @hght: height of tree where nnode is, is passed and returned here
+ *
+ * The parent of @nnode is returned when @nnode is its last possible child or
+ * when no later sibling is in memory. Otherwise the next sibling which is in
+ * memory is taken and the function descends from it in the same way as
+ * 'first_nnode()' does. As a result, an nnode is returned only after the
+ * nnodes below it which are in memory. @hght is updated to match the nnode
+ * returned.
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *next_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *nnode, int *hght)
+{
+ struct ubifs_nnode *parent;
+ int iip, h, i, found;
+
+ parent = nnode->parent;
+ if (!parent)
+ return NULL; /* @nnode has no parent, nothing follows it */
+ if (nnode->iip == UBIFS_LPT_FANOUT - 1) {
+ *hght -= 1;
+ return parent;
+ }
+ for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
+ nnode = parent->nbranch[iip].nnode; /* is NULL after the loop if no sibling is found */
+ if (nnode)
+ break;
+ }
+ if (!nnode) {
+ *hght -= 1;
+ return parent;
+ }
+ for (h = *hght + 1; h < c->lpt_hght; h++) {
+ found = 0;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (nnode->nbranch[i].nnode) {
+ found = 1;
+ nnode = nnode->nbranch[i].nnode;
+ *hght = h;
+ break;
+ }
+ }
+ if (!found)
+ break;
+ }
+ return nnode;
+}
+
+/**
+ * ubifs_lpt_free - free resources owned by the LPT.
+ * @c: UBIFS file-system description object
+ * @wr_only: free only resources used for writing
+ *
+ * The write-only resources are the obsolete cnodes, @c->ltab_cmt, @c->lpt_buf
+ * and @c->lsave; the pointers are reset to %NULL after freeing. If @wr_only is
+ * zero, the in-memory tree, the LPT heaps, @c->dirty_idx.arr, @c->ltab and
+ * @c->lpt_nod_buf are freed as well; those pointers are not reset.
+ */
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
+{
+ struct ubifs_nnode *nnode;
+ int i, hght;
+
+ /* Free write-only things first */
+
+ free_obsolete_cnodes(c); /* Leftover from a failed commit */
+
+ vfree(c->ltab_cmt);
+ c->ltab_cmt = NULL;
+ vfree(c->lpt_buf);
+ c->lpt_buf = NULL;
+ kfree(c->lsave);
+ c->lsave = NULL;
+
+ if (wr_only)
+ return;
+
+ /* Now free the rest */
+
+ nnode = first_nnode(c, &hght);
+ while (nnode) {
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++)
+ kfree(nnode->nbranch[i].nnode); /* frees the children, not nnode itself */
+ nnode = next_nnode(c, nnode, &hght);
+ }
+ for (i = 0; i < LPROPS_HEAP_CNT; i++)
+ kfree(c->lpt_heap[i].arr);
+ kfree(c->dirty_idx.arr);
+ kfree(c->nroot); /* the root is nobody's child, so it is freed here */
+ vfree(c->ltab);
+ kfree(c->lpt_nod_buf);
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
+ * @buf: buffer
+ * @len: buffer length
+ *
+ * Returns %1 if all @len bytes of @buf are 0xff (which is also the case when
+ * @len is not positive) and %0 otherwise.
+ */
+static int dbg_is_all_ff(uint8_t *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ if (buf[i] != 0xff)
+ return 0;
+ return 1;
+}
+
+/**
+ * dbg_is_nnode_dirty - determine if a nnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ *
+ * The position of an nnode is taken from the branch of its parent, or from
+ * @c->lpt_lnum and @c->lpt_offs if the nnode has no parent. Returns %1 if the
+ * nnode at @lnum:@offs has the %DIRTY_CNODE flag set or if no nnode in memory
+ * is at that position, and %0 if the nnode is found and the flag is not set.
+ */
+static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ struct ubifs_nnode *nnode;
+ int hght;
+
+ /* Entire tree is in memory so first_nnode / next_nnode are ok */
+ nnode = first_nnode(c, &hght);
+ for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
+ struct ubifs_nbranch *branch;
+
+ cond_resched();
+ if (nnode->parent) {
+ branch = &nnode->parent->nbranch[nnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &nnode->flags))
+ return 1;
+ return 0;
+ } else {
+ if (c->lpt_lnum != lnum || c->lpt_offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &nnode->flags))
+ return 1;
+ return 0;
+ }
+ }
+ return 1; /* nothing in the tree refers to lnum:offs */
+}
+
+/**
+ * dbg_is_pnode_dirty - determine if a pnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ *
+ * Every pnode is looked up in turn and its position is taken from the branch
+ * of its parent. Returns %1 if the pnode at @lnum:@offs has the %DIRTY_CNODE
+ * flag set or if no pnode is at that position, and %0 if the pnode is found
+ * and the flag is not set. If 'pnode_lookup()' fails, its negative error code
+ * is returned.
+ */
+static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ int i, cnt;
+
+ cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ struct ubifs_pnode *pnode;
+ struct ubifs_nbranch *branch;
+
+ cond_resched();
+ pnode = pnode_lookup(c, i);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ branch = &pnode->parent->nbranch[pnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &pnode->flags))
+ return 1;
+ return 0;
+ }
+ return 1; /* no pnode refers to lnum:offs */
+}
+
+/**
+ * dbg_is_ltab_dirty - determine if a ltab node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where ltab node was written
+ * @offs: offset where ltab node was written
+ *
+ * Returns %1 if @lnum:@offs is not the position recorded in @c->ltab_lnum and
+ * @c->ltab_offs, or if the %LTAB_DIRTY flag is set in @c->lpt_drty_flgs.
+ * Returns %0 otherwise.
+ */
+static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->ltab_lnum || offs != c->ltab_offs)
+ return 1;
+ return (c->lpt_drty_flgs & LTAB_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_lsave_dirty - determine if a lsave node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where lsave node was written
+ * @offs: offset where lsave node was written
+ *
+ * Returns %1 if @lnum:@offs is not the position recorded in @c->lsave_lnum and
+ * @c->lsave_offs, or if the %LSAVE_DIRTY flag is set in @c->lpt_drty_flgs.
+ * Returns %0 otherwise.
+ */
+static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->lsave_lnum || offs != c->lsave_offs)
+ return 1;
+ return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_node_dirty - determine if a node is dirty.
+ * @c: the UBIFS file-system description object
+ * @node_type: node type
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function dispatches on @node_type to the matching 'dbg_is_*_dirty()'
+ * helper and returns its result. A node of any other type is reported as
+ * dirty (%1). Note, the result may be a negative error code in case of
+ * %UBIFS_LPT_PNODE.
+ */
+static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
+ int offs)
+{
+ switch (node_type) {
+ case UBIFS_LPT_NNODE:
+ return dbg_is_nnode_dirty(c, lnum, offs);
+ case UBIFS_LPT_PNODE:
+ return dbg_is_pnode_dirty(c, lnum, offs);
+ case UBIFS_LPT_LTAB:
+ return dbg_is_ltab_dirty(c, lnum, offs);
+ case UBIFS_LPT_LSAVE:
+ return dbg_is_lsave_dirty(c, lnum, offs);
+ }
+ return 1;
+}
+
+/**
+ * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number to check
+ *
+ * The LEB is read into @c->dbg_buf and is walked node by node. Padding and the
+ * nodes which 'dbg_is_node_dirty()' reports as dirty are summed up as dirty
+ * space. What is left after the last node is treated as free space and has to
+ * contain only 0xff bytes. The free and dirty totals are then compared with
+ * the ltab entry of the LEB; all mismatches are reported before returning.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
+{
+ int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
+ int ret;
+ void *buf = c->dbg_buf;
+
+ dbg_lp("LEB %d", lnum);
+ err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+ if (err) {
+ dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
+ return err;
+ }
+ while (1) {
+ if (!is_a_node(c, buf, len)) {
+ int i, pad_len;
+
+ pad_len = get_pad_len(c, buf, len);
+ if (pad_len) {
+ buf += pad_len;
+ len -= pad_len;
+ dirty += pad_len; /* padding is accounted as dirty space */
+ continue;
+ }
+ if (!dbg_is_all_ff(buf, len)) {
+ dbg_msg("invalid empty space in LEB %d at %d",
+ lnum, c->leb_size - len);
+ err = -EINVAL;
+ }
+ i = lnum - c->lpt_first; /* LEB number -> ltab index */
+ if (len != c->ltab[i].free) {
+ dbg_msg("invalid free space in LEB %d "
+ "(free %d, expected %d)",
+ lnum, len, c->ltab[i].free);
+ err = -EINVAL;
+ }
+ if (dirty != c->ltab[i].dirty) {
+ dbg_msg("invalid dirty space in LEB %d "
+ "(dirty %d, expected %d)",
+ lnum, dirty, c->ltab[i].dirty);
+ err = -EINVAL;
+ }
+ return err; /* still zero here if none of the checks failed */
+ }
+ node_type = get_lpt_node_type(c, buf, &node_num);
+ node_len = get_lpt_node_len(c, node_type);
+ ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
+ if (ret == 1) /* NOTE(review): a negative (error) result is not propagated */
+ dirty += node_len;
+ buf += node_len;
+ len -= node_len;
+ }
+}
+
+/**
+ * dbg_check_ltab - check the free and dirty space in the ltab.
+ * @c: the UBIFS file-system description object
+ *
+ * Nothing is checked unless %UBIFS_CHK_LPROPS is set in 'ubifs_chk_flags'.
+ * Otherwise every pnode is looked up, the tree is checked with
+ * 'dbg_check_lpt_nodes()', and every LEB from @c->lpt_first to @c->lpt_last
+ * is checked with 'dbg_check_ltab_lnum()'. The check stops at the first
+ * failure.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_ltab(struct ubifs_info *c)
+{
+ int lnum, err, i, cnt;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ /* Bring the entire tree into memory */
+ cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ struct ubifs_pnode *pnode;
+
+ pnode = pnode_lookup(c, i);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ cond_resched();
+ }
+
+ /* Check nodes */
+ err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0);
+ if (err)
+ return err;
+
+ /* Check each LEB */
+ for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+ err = dbg_check_ltab_lnum(c, lnum);
+ if (err) {
+ dbg_err("failed at LEB %d", lnum);
+ return err;
+ }
+ }
+
+ dbg_lp("succeeded");
+ return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
new file mode 100644
index 00000000000..71d5493bf56
--- /dev/null
+++ b/fs/ubifs/master.c
@@ -0,0 +1,387 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/* This file implements reading and writing the master node */
+
+#include "ubifs.h"
+
+/**
+ * scan_for_master - search the valid master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the master node LEBs and searches for the latest master
+ * node. LEB %UBIFS_MST_LNUM and the LEB after it are scanned and the last node
+ * of each is examined. The search succeeds only if both LEBs contain the same
+ * non-zero number of nodes, both last nodes are master nodes at the same
+ * offset, and their contents after the common header are identical. On
+ * success the master node is left in @c->mst_node and its offset in
+ * @c->mst_offs.
+ *
+ * Returns zero in case of success and a negative error code in case of
+ * failure (%-EINVAL if the two LEBs do not hold a matching master node).
+ */
+static int scan_for_master(struct ubifs_info *c)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ int lnum, offs = 0, nodes_cnt;
+
+ lnum = UBIFS_MST_LNUM;
+
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ nodes_cnt = sleb->nodes_cnt;
+ if (nodes_cnt > 0) {
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+ list); /* the last node scanned in this LEB */
+ if (snod->type != UBIFS_MST_NODE)
+ goto out;
+ memcpy(c->mst_node, snod->node, snod->len);
+ offs = snod->offs;
+ }
+ ubifs_scan_destroy(sleb);
+
+ lnum += 1;
+
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ if (sleb->nodes_cnt != nodes_cnt)
+ goto out;
+ if (!sleb->nodes_cnt)
+ goto out; /* both LEBs are empty, there is no master node */
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
+ if (snod->type != UBIFS_MST_NODE)
+ goto out;
+ if (snod->offs != offs)
+ goto out;
+ if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
+ (void *)snod->node + UBIFS_CH_SZ,
+ UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) /* the common header is not compared */
+ goto out;
+ c->mst_offs = offs;
+ ubifs_scan_destroy(sleb);
+ return 0;
+
+out:
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * validate_master - validate master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function validates data which was read from master node. The values
+ * are checked in the form in which they are stored in @c, one group of related
+ * fields after another, and the validation stops at the first group which
+ * fails. In that case an error message with @c->mst_offs and the number of the
+ * failed check (1-23) is printed and the master node is dumped.
+ *
+ * Returns zero if the data is all right and %-EINVAL if not.
+ */
+static int validate_master(const struct ubifs_info *c)
+{
+ long long main_sz;
+ int err; /* number of the failed check, it is not an error code */
+
+ if (c->max_sqnum >= SQNUM_WATERMARK) {
+ err = 1;
+ goto out;
+ }
+
+ if (c->cmt_no >= c->max_sqnum) {
+ err = 2;
+ goto out;
+ }
+
+ if (c->highest_inum >= INUM_WATERMARK) {
+ err = 3;
+ goto out;
+ }
+
+ if (c->lhead_lnum < UBIFS_LOG_LNUM ||
+ c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
+ c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
+ c->lhead_offs & (c->min_io_size - 1)) {
+ err = 4;
+ goto out;
+ }
+
+ if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
+ c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
+ err = 5;
+ goto out;
+ }
+
+ if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
+ c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
+ err = 6;
+ goto out;
+ }
+
+ if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
+ err = 7;
+ goto out;
+ }
+
+ if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
+ c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
+ c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
+ err = 8;
+ goto out;
+ }
+
+ main_sz = (long long)c->main_lebs * c->leb_size;
+ if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+ err = 9;
+ goto out;
+ }
+
+ if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
+ c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
+ err = 10;
+ goto out;
+ }
+
+ if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
+ c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
+ c->nhead_offs > c->leb_size) {
+ err = 11;
+ goto out;
+ }
+
+ if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
+ c->ltab_offs < 0 ||
+ c->ltab_offs + c->ltab_sz > c->leb_size) {
+ err = 12;
+ goto out;
+ }
+
+ if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
+ c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
+ c->lsave_offs + c->lsave_sz > c->leb_size)) {
+ err = 13;
+ goto out;
+ }
+
+ if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
+ err = 14;
+ goto out;
+ }
+
+ if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
+ err = 15;
+ goto out;
+ }
+
+ if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
+ err = 16;
+ goto out;
+ }
+
+ if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
+ c->lst.total_free & 7) {
+ err = 17;
+ goto out;
+ }
+
+ if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
+ err = 18;
+ goto out;
+ }
+
+ if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
+ err = 19;
+ goto out;
+ }
+
+ if (c->lst.total_free + c->lst.total_dirty +
+ c->lst.total_used > main_sz) {
+ err = 20;
+ goto out;
+ }
+
+ if (c->lst.total_dead + c->lst.total_dark +
+ c->lst.total_used + c->old_idx_sz > main_sz) {
+ err = 21;
+ goto out;
+ }
+
+ if (c->lst.total_dead < 0 ||
+ c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
+ c->lst.total_dead & 7) {
+ err = 22;
+ goto out;
+ }
+
+ if (c->lst.total_dark < 0 ||
+ c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
+ c->lst.total_dark & 7) {
+ err = 23;
+ goto out;
+ }
+
+ return 0;
+
+out:
+ ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
+ dbg_dump_node(c, c->mst_node);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_master - read master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds and reads the master node during file-system mount. If
+ * the flash is empty, it creates default master node as well. Returns zero in
+ * case of success and a negative error code in case of failure.
+ *
+ * The master node buffer @c->mst_node is allocated here. If scanning for the
+ * master node fails, 'ubifs_recover_master_node()' is tried. The master node
+ * fields are then converted to CPU byte order and stored in @c. If the LEB
+ * count stored in the master node differs from @c->leb_cnt, the file-system is
+ * treated as resized: growing adjusts the space accounting, shrinking is
+ * rejected with %-EINVAL. Finally the values are validated.
+ *
+ * NOTE(review): the creation of a default master node mentioned above is not
+ * visible in this function - presumably it is done elsewhere, confirm.
+ */
+int ubifs_read_master(struct ubifs_info *c)
+{
+ int err, old_leb_cnt;
+
+ c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+ if (!c->mst_node)
+ return -ENOMEM;
+
+ err = scan_for_master(c);
+ if (err) {
+ err = ubifs_recover_master_node(c);
+ if (err)
+ /*
+ * Note, we do not free 'c->mst_node' here because the
+ * unmount routine will take care of this.
+ */
+ return err;
+ }
+
+ /* Make sure that the recovery flag is clear */
+ c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
+
+ c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum);
+ c->highest_inum = le64_to_cpu(c->mst_node->highest_inum);
+ c->cmt_no = le64_to_cpu(c->mst_node->cmt_no);
+ c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum);
+ c->zroot.offs = le32_to_cpu(c->mst_node->root_offs);
+ c->zroot.len = le32_to_cpu(c->mst_node->root_len);
+ c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum);
+ c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
+ c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
+ c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
+ c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
+ c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
+ c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
+ c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
+ c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs);
+ c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum);
+ c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs);
+ c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum);
+ c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs);
+ c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum);
+ c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs);
+ c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs);
+ old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt);
+ c->lst.total_free = le64_to_cpu(c->mst_node->total_free);
+ c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
+ c->lst.total_used = le64_to_cpu(c->mst_node->total_used);
+ c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
+ c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
+
+ c->calc_idx_sz = c->old_idx_sz;
+
+ if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
+ c->no_orphs = 1;
+
+ if (old_leb_cnt != c->leb_cnt) {
+ /* The file system has been resized */
+ int growth = c->leb_cnt - old_leb_cnt;
+
+ if (c->leb_cnt < old_leb_cnt ||
+ c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+ ubifs_err("bad leb_cnt on master node");
+ dbg_dump_node(c, c->mst_node);
+ return -EINVAL;
+ }
+
+ dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
+ old_leb_cnt, c->leb_cnt);
+ c->lst.empty_lebs += growth; /* the added LEBs are accounted as empty */
+ c->lst.total_free += growth * (long long)c->leb_size;
+ c->lst.total_dark += growth * (long long)c->dark_wm;
+
+ /*
+ * Reflect changes back onto the master node. N.B. the master
+ * node gets written immediately whenever mounting (or
+ * remounting) in read-write mode, so we do not need to write it
+ * here.
+ */
+ c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
+ c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
+ c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
+ c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
+ }
+
+ err = validate_master(c);
+ if (err)
+ return err;
+
+ err = dbg_old_index_check_init(c, &c->zroot);
+
+ return err;
+}
+
+/**
+ * ubifs_write_master - write master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the master node. The caller has to take the
+ * @c->mst_mutex lock before calling this function. Returns zero in case of
+ * success and a negative error code in case of failure. The master node is
+ * written twice to enable recovery.
+ *
+ * The new copy is written @c->mst_node_alsz bytes after the previous one, to
+ * the same offset of LEB %UBIFS_MST_LNUM and of the LEB after it. If the new
+ * copy does not fit into the LEB, writing restarts from offset zero and each
+ * of the two LEBs is unmapped before it is written. @c->mst_offs is updated to
+ * the new offset before the first write.
+ *
+ * NOTE(review): %-EINVAL is returned if @c->ro_media is set, whereas the
+ * 'ubifs_leb_*()' helpers return %-EROFS in the same situation - confirm that
+ * this is intended.
+ */
+int ubifs_write_master(struct ubifs_info *c)
+{
+ int err, lnum, offs, len;
+
+ if (c->ro_media)
+ return -EINVAL;
+
+ lnum = UBIFS_MST_LNUM;
+ offs = c->mst_offs + c->mst_node_alsz;
+ len = UBIFS_MST_NODE_SZ;
+
+ if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ offs = 0;
+ }
+
+ c->mst_offs = offs;
+ c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
+
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+ if (err)
+ return err;
+
+ lnum += 1;
+
+ if (offs == 0) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+
+ return err;
+}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
new file mode 100644
index 00000000000..4beccfc256d
--- /dev/null
+++ b/fs/ubifs/misc.h
@@ -0,0 +1,342 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file contains miscellaneous helper functions.
+ */
+
+#ifndef __UBIFS_MISC_H__
+#define __UBIFS_MISC_H__
+
+/**
+ * ubifs_zn_dirty - check if znode is dirty.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is dirty and %0 otherwise. A znode
+ * is dirty if the %DIRTY_ZNODE bit is set in its flags.
+ */
+static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
+{
+ return !!test_bit(DIRTY_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_wake_up_bgt - wake up background thread.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets @c->need_bgt and wakes up the task @c->bgt. Nothing is
+ * done if @c->bgt is %NULL or if @c->need_bgt is already set.
+ */
+static inline void ubifs_wake_up_bgt(struct ubifs_info *c)
+{
+ if (c->bgt && !c->need_bgt) {
+ c->need_bgt = 1;
+ wake_up_process(c->bgt);
+ }
+}
+
+/**
+ * ubifs_tnc_find_child - find next child in znode.
+ * @znode: znode to search at
+ * @start: the zbranch index to start at
+ *
+ * This helper function looks for znode child starting at index @start, i.e.
+ * for the first zbranch at index @start or higher which has a non-%NULL
+ * 'znode' pointer. Returns the child or %NULL if no children were found.
+ */
+static inline struct ubifs_znode *
+ubifs_tnc_find_child(struct ubifs_znode *znode, int start)
+{
+ while (start < znode->child_cnt) {
+ if (znode->zbranch[start].znode)
+ return znode->zbranch[start].znode;
+ start += 1;
+ }
+
+ return NULL;
+}
+
+/**
+ * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object.
+ * @inode: the VFS 'struct inode' pointer
+ *
+ * @inode has to be the 'vfs_inode' member of a 'struct ubifs_inode' object.
+ * This function returns a pointer to that containing object.
+ */
+static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
+{
+ return container_of(inode, struct ubifs_inode, vfs_inode);
+}
+
+/**
+ * ubifs_ro_mode - switch UBIFS to read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ *
+ * This function sets @c->ro_media, prints a warning which includes @err and
+ * dumps the stack. Nothing is done if @c->ro_media is already set.
+ */
+static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+ if (!c->ro_media) {
+ c->ro_media = 1;
+ ubifs_warn("switched to read-only mode, error %d", err);
+ dbg_dump_stack();
+ }
+}
+
+/**
+ * ubifs_compr_present - check if compressor was compiled in.
+ * @compr_type: compressor type to check
+ *
+ * This function returns %1 if compressor of type @compr_type is present, and
+ * %0 if not. The compressor is present if the 'capi_name' field of its
+ * 'ubifs_compressors[]' entry is not %NULL.
+ */
+static inline int ubifs_compr_present(int compr_type)
+{
+ ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+ return !!ubifs_compressors[compr_type]->capi_name;
+}
+
+/**
+ * ubifs_compr_name - get compressor name string by its type.
+ * @compr_type: compressor type
+ *
+ * This function returns compressor type string, which is the 'name' field of
+ * the 'ubifs_compressors[]' entry for @compr_type.
+ */
+static inline const char *ubifs_compr_name(int compr_type)
+{
+ ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+ return ubifs_compressors[compr_type]->name;
+}
+
+/**
+ * ubifs_wbuf_sync - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This is the same as 'ubifs_wbuf_sync_nolock()' but it does not assume
+ * that the write-buffer is already locked: @wbuf->io_mutex is taken around
+ * the call, with @wbuf->jhead passed as the lock nesting subclass. Returns
+ * whatever 'ubifs_wbuf_sync_nolock()' returns.
+ */
+static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
+{
+ int err;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+}
+
+/**
+ * ubifs_leb_unmap - unmap an LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to unmap
+ *
+ * This is a wrapper around 'ubi_leb_unmap()' which refuses to do anything if
+ * @c->ro_media is set (%-EROFS is returned) and which prints an error message
+ * if the operation fails.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_unmap(c->ubi, lnum);
+ if (err) {
+ ubifs_err("unmap LEB %d failed, error %d", lnum, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_leb_write - write to a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to write
+ * @buf: buffer to write from
+ * @offs: offset within LEB to write to
+ * @len: length to write
+ * @dtype: data type
+ *
+ * This is a wrapper around 'ubi_leb_write()' which refuses to do anything if
+ * @c->ro_media is set (%-EROFS is returned) and which prints an error message
+ * if the operation fails.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
+ const void *buf, int offs, int len, int dtype)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+ if (err) {
+ ubifs_err("writing %d bytes at %d:%d, error %d",
+ len, lnum, offs, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_leb_change - atomic LEB change.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to write
+ * @buf: buffer to write from
+ * @len: length to write
+ * @dtype: data type
+ *
+ * This is a wrapper around 'ubi_leb_change()' which refuses to do anything if
+ * @c->ro_media is set (%-EROFS is returned) and which prints an error message
+ * if the operation fails.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
+ const void *buf, int len, int dtype)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+ if (err) {
+ ubifs_err("changing %d bytes in LEB %d, error %d",
+ len, lnum, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_encode_dev - encode device node IDs.
+ * @dev: UBIFS device node information
+ * @rdev: device IDs to encode
+ *
+ * This is a helper function which encodes major/minor numbers of a device node
+ * into UBIFS device node description. We use standard Linux "new" and "huge"
+ * encodings. The "new" encoding is used if 'new_valid_dev()' accepts @rdev,
+ * and the "huge" encoding is used otherwise. The encoded value is stored in
+ * little-endian byte order.
+ *
+ * Returns the size of the @dev member which was filled in.
+ */
+static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
+{
+ if (new_valid_dev(rdev)) {
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
+ } else {
+ dev->huge = cpu_to_le64(huge_encode_dev(rdev));
+ return sizeof(dev->huge);
+ }
+}
+
+/**
+ * ubifs_add_dirt - add dirty space to LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to add dirty space for
+ * @dirty: dirty space to add
+ *
+ * This is a helper function which increases the amount of dirty LEB space. It
+ * is a thin wrapper around 'ubifs_update_one_lp()'. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+ return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
+}
+
+/**
+ * ubifs_return_leb - return LEB to lprops.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to return
+ *
+ * This helper function clears the "taken" flag of a logical eraseblock in the
+ * lprops. It is a thin wrapper around 'ubifs_change_one_lp()'. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
+{
+ return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_TAKEN, 0);
+}
+
+/**
+ * ubifs_idx_node_sz - return index node size.
+ * @c: the UBIFS file-system description object
+ * @child_cnt: number of children of this index node
+ *
+ * The size is %UBIFS_IDX_NODE_SZ plus, for each of the @child_cnt children,
+ * %UBIFS_BRANCH_SZ and @c->key_len bytes.
+ */
+static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
+{
+ return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+}
+
+/**
+ * ubifs_idx_branch - return pointer to an index branch.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ * @bnum: branch number
+ *
+ * Branches are located by byte offset from @idx->branches because each of
+ * them takes %UBIFS_BRANCH_SZ plus @c->key_len bytes. @bnum is not
+ * range-checked.
+ */
+static inline
+struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
+ const struct ubifs_idx_node *idx,
+ int bnum)
+{
+ return (struct ubifs_branch *)((void *)idx->branches +
+ (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+}
+
+/**
+ * ubifs_idx_key - return pointer to an index key.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ *
+ * The pointer returned is the 'key' member of the first branch of @idx. @c is
+ * not used by this function.
+ */
+static inline void *ubifs_idx_key(const struct ubifs_info *c,
+ const struct ubifs_idx_node *idx)
+{
+ return (void *)((struct ubifs_branch *)idx->branches)->key;
+}
+
+/**
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space applications tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the
+ * above expectation.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, doubled because we always allow enough
+ * space to write the index twice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ *
+ * The value returned is @free divided (rounding down) by the size of one
+ * maximum-size data node plus two maximum-size index nodes, multiplied by the
+ * difference between %UBIFS_MAX_DATA_NODE_SZ and %UBIFS_DATA_NODE_SZ.
+ */
+static inline long long ubifs_reported_space(const struct ubifs_info *c,
+ uint64_t free)
+{
+ int divisor, factor;
+
+ divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
+ factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
+ do_div(free, divisor); /* 'free' now holds the quotient */
+
+ return free * factor;
+}
+
+/**
+ * ubifs_current_time - round current time to time granularity.
+ * @inode: inode
+ *
+ * If the time granularity of the super-block of @inode is finer than one
+ * second, 'current_fs_time()' is used, otherwise %CURRENT_TIME_SEC is
+ * returned.
+ */
+static inline struct timespec ubifs_current_time(struct inode *inode)
+{
+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
new file mode 100644
index 00000000000..3afeb9242c6
--- /dev/null
+++ b/fs/ubifs/orphan.c
@@ -0,0 +1,958 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Author: Adrian Hunter
+ */
+
+#include "ubifs.h"
+
+/*
+ * An orphan is an inode number whose inode node has been committed to the index
+ * with a link count of zero. That happens when an open file is deleted
+ * (unlinked) and then a commit is run. In the normal course of events the inode
+ * would be deleted when the file is closed. However in the case of an unclean
+ * unmount, orphans need to be accounted for. After an unclean unmount, the
+ * orphans' inodes must be deleted which means either scanning the entire index
+ * looking for them, or keeping a list on flash somewhere. This unit implements
+ * the latter approach.
+ *
+ * The orphan area is a fixed number of LEBs situated between the LPT area and
+ * the main area. The number of orphan area LEBs is specified when the file
+ * system is created. The minimum number is 1. The size of the orphan area
+ * should be so that it can hold the maximum number of orphans that are expected
+ * to ever exist at one time.
+ *
+ * The number of orphans that can fit in a LEB is:
+ *
+ * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)
+ *
+ * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough.
+ *
+ * Orphans are accumulated in a rb-tree. When an inode's link count drops to
+ * zero, the inode number is added to the rb-tree. It is removed from the tree
+ * when the inode is deleted. Any new orphans that are in the orphan tree when
+ * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * If the orphan area is full, it is consolidated to make space. There is
+ * always enough space because validation prevents the user from creating more
+ * than the maximum number of orphans allowed.
+ */
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_check_orphans(struct ubifs_info *c);
+#else
+#define dbg_check_orphans(c) 0
+#endif
+
+/**
+ * ubifs_add_orphan - add an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Add an orphan. This function is called when an inode's link count drops to
+ * zero.
+ */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *fresh, *cur;
+	struct rb_node **link, *parent = NULL;
+	int ret;
+
+	/* Allocate up front so that nothing sleeps under the spinlock */
+	fresh = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
+	if (!fresh)
+		return -ENOMEM;
+	fresh->inum = inum;
+	fresh->new = 1;
+
+	spin_lock(&c->orphan_lock);
+	if (c->tot_orphans >= c->max_orphans) {
+		/* The orphan limit has been reached */
+		ret = -ENFILE;
+		goto out_discard;
+	}
+
+	/* Find the insertion point in the tree ordered by inode number */
+	link = &c->orph_tree.rb_node;
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct ubifs_orphan, rb);
+		if (inum < cur->inum) {
+			link = &parent->rb_left;
+		} else if (inum > cur->inum) {
+			link = &parent->rb_right;
+		} else {
+			/* Already present - complain, but report success */
+			dbg_err("orphaned twice");
+			ret = 0;
+			goto out_discard;
+		}
+	}
+
+	c->tot_orphans += 1;
+	c->new_orphans += 1;
+	rb_link_node(&fresh->rb, parent, link);
+	rb_insert_color(&fresh->rb, &c->orph_tree);
+	list_add_tail(&fresh->list, &c->orph_list);
+	list_add_tail(&fresh->new_list, &c->orph_new);
+	spin_unlock(&c->orphan_lock);
+	dbg_gen("ino %lu", inum);
+	return 0;
+
+out_discard:
+	spin_unlock(&c->orphan_lock);
+	kfree(fresh);
+	return ret;
+}
+
+/**
+ * ubifs_delete_orphan - delete an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Delete an orphan. This function is called when an inode is deleted.
+ */
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *hit = NULL;
+	struct rb_node *node;
+
+	spin_lock(&c->orphan_lock);
+
+	/* Look the inode number up in the orphan tree */
+	node = c->orph_tree.rb_node;
+	while (node) {
+		struct ubifs_orphan *cur;
+
+		cur = rb_entry(node, struct ubifs_orphan, rb);
+		if (inum < cur->inum) {
+			node = node->rb_left;
+		} else if (inum > cur->inum) {
+			node = node->rb_right;
+		} else {
+			hit = cur;
+			break;
+		}
+	}
+
+	if (!hit) {
+		spin_unlock(&c->orphan_lock);
+		dbg_err("missing orphan ino %lu", inum);
+		dbg_dump_stack();
+		return;
+	}
+
+	if (hit->dnext) {
+		/* Already chained on the deferred-deletion list */
+		spin_unlock(&c->orphan_lock);
+		dbg_gen("deleted twice ino %lu", inum);
+		return;
+	}
+
+	if (hit->cnext) {
+		/* Linked on the commit list: defer by pushing onto 'orph_dnext' */
+		hit->dnext = c->orph_dnext;
+		c->orph_dnext = hit;
+		spin_unlock(&c->orphan_lock);
+		dbg_gen("delete later ino %lu", inum);
+		return;
+	}
+
+	/* Not referenced by the commit list - remove it right away */
+	rb_erase(&hit->rb, &c->orph_tree);
+	list_del(&hit->list);
+	c->tot_orphans -= 1;
+	if (hit->new) {
+		list_del(&hit->new_list);
+		c->new_orphans -= 1;
+	}
+	spin_unlock(&c->orphan_lock);
+	kfree(hit);
+	dbg_gen("inum %lu", inum);
+}
+
+/**
+ * ubifs_orphan_start_commit - start commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * Start commit of orphans. With @c->orphan_lock held, every orphan on the
+ * @c->orph_new list has its 'new' flag cleared and is chained through its
+ * 'cnext' pointer, starting at @c->orph_cnext. The count of new orphans is
+ * moved from @c->new_orphans to @c->cmt_orphans, the @c->orph_new list is
+ * re-initialized, and @c->no_orphs is set to %1 if @c->tot_orphans is zero and
+ * to %0 otherwise. This function always returns %0.
+ *
+ * NOTE(review): the assignment '*last = orphan->cnext' is executed after
+ * 'list_for_each_entry()' has finished, at which point 'orphan' is the cursor
+ * computed from the list head '&c->orph_new' rather than a real list element.
+ * The value read therefore comes from memory positioned relative to that head
+ * inside '*c'. Presumably this depends on how the fields following 'orph_new'
+ * in 'struct ubifs_info' line up with 'struct ubifs_orphan' - neither layout
+ * is visible here, so confirm before changing this line or either structure.
+ */
+int ubifs_orphan_start_commit(struct ubifs_info *c)
+{
+ struct ubifs_orphan *orphan, **last;
+
+ spin_lock(&c->orphan_lock);
+ last = &c->orph_cnext;
+ list_for_each_entry(orphan, &c->orph_new, new_list) {
+ ubifs_assert(orphan->new);
+ orphan->new = 0;
+ *last = orphan;
+ last = &orphan->cnext;
+ }
+ *last = orphan->cnext;
+ c->cmt_orphans = c->new_orphans;
+ c->new_orphans = 0;
+ dbg_cmt("%d orphans to commit", c->cmt_orphans);
+ INIT_LIST_HEAD(&c->orph_new);
+ if (c->tot_orphans == 0)
+ c->no_orphs = 1;
+ else
+ c->no_orphs = 0;
+ spin_unlock(&c->orphan_lock);
+ return 0;
+}
+
+/**
+ * avail_orphs - calculate available space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in the
+ * available space.
+ */
+static int avail_orphs(struct ubifs_info *c)
+{
+	int whole_lebs, room, cnt;
+
+	/* LEBs after the orphan head LEB that are still entirely unused */
+	whole_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1;
+	cnt = whole_lebs *
+	      ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+
+	/* What is left in the head LEB counts only if one orphan still fits */
+	room = c->leb_size - c->ohead_offs;
+	if (room < UBIFS_ORPH_NODE_SZ + sizeof(__le64))
+		return cnt;
+	return cnt + (room - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+}
+
+/**
+ * tot_avail_orphs - calculate total space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in half
+ * the total space. That leaves half the space for adding new orphans.
+ */
+static int tot_avail_orphs(struct ubifs_info *c)
+{
+	/* Orphans that fit when every orphan LEB holds one full-size node */
+	int capacity = c->orph_lebs *
+		       ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+
+	/* Only half of that is handed out */
+	return capacity / 2;
+}
+
+/**
+ * do_write_orph_node - write a node
+ * @c: UBIFS file-system description object
+ * @len: length of node
+ * @atomic: write atomically
+ *
+ * This function writes a node to the orphan head from the orphan buffer. If
+ * %atomic is not zero, then the write is done atomically. On success, %0 is
+ * returned, otherwise a negative error code is returned.
+ */
+static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
+{
+	int ret;
+
+	if (atomic) {
+		/* Atomic path: prepare the node and replace the whole LEB */
+		ubifs_assert(c->ohead_offs == 0);
+		ubifs_prepare_node(c, c->orph_buf, len, 1);
+		len = ALIGN(len, c->min_io_size);
+		return ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
+					UBI_SHORTTERM);
+	}
+
+	if (c->ohead_offs == 0) {
+		/* Ensure LEB has been unmapped */
+		ret = ubifs_leb_unmap(c, c->ohead_lnum);
+		if (ret)
+			return ret;
+	}
+
+	return ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
+				c->ohead_offs, UBI_SHORTTERM);
+}
+
+/**
+ * write_orph_node - write an orph node
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function builds an orph node from the cnext list and writes it to the
+ * orphan head. On success, %0 is returned, otherwise a negative error code
+ * is returned.
+ */
+static int write_orph_node(struct ubifs_info *c, int atomic)
+{
+	struct ubifs_orphan *cur, *next;
+	struct ubifs_orph_node *node;
+	int room, ret, node_len, nr, k;
+
+	ubifs_assert(c->cmt_orphans > 0);
+
+	/* Move on to the next LEB if not even one orphan fits in this one */
+	room = c->leb_size - c->ohead_offs;
+	if (room < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
+		c->ohead_lnum += 1;
+		c->ohead_offs = 0;
+		room = c->leb_size;
+		if (c->ohead_lnum > c->orph_last) {
+			/*
+			 * We limit the number of orphans so that this should
+			 * never happen.
+			 */
+			ubifs_err("out of space in orphan area");
+			return -EINVAL;
+		}
+	}
+
+	/* As many orphans as fit, capped by what is left to commit */
+	nr = (room - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+	if (nr > c->cmt_orphans)
+		nr = c->cmt_orphans;
+	node_len = UBIFS_ORPH_NODE_SZ + nr * sizeof(__le64);
+
+	ubifs_assert(c->orph_buf);
+	node = c->orph_buf;
+	node->ch.node_type = UBIFS_ORPH_NODE;
+
+	/* Take 'nr' orphans off the front of the commit chain */
+	spin_lock(&c->orphan_lock);
+	cur = c->orph_cnext;
+	for (k = 0; k < nr; k++) {
+		next = cur->cnext;
+		node->inos[k] = cpu_to_le64(cur->inum);
+		cur->cnext = NULL;
+		cur = next;
+	}
+	c->orph_cnext = cur;
+	c->cmt_orphans -= nr;
+	spin_unlock(&c->orphan_lock);
+
+	if (c->cmt_orphans)
+		node->cmt_no = cpu_to_le64(c->cmt_no + 1);
+	else
+		/* Mark the last node of the commit */
+		node->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
+
+	ubifs_assert(c->ohead_offs + node_len <= c->leb_size);
+	ubifs_assert(c->ohead_lnum >= c->orph_first);
+	ubifs_assert(c->ohead_lnum <= c->orph_last);
+
+	ret = do_write_orph_node(c, node_len, atomic);
+
+	/* The head offset is advanced whether or not the write succeeded */
+	c->ohead_offs += ALIGN(node_len, c->min_io_size);
+	c->ohead_offs = ALIGN(c->ohead_offs, 8);
+	return ret;
+}
+
+/**
+ * write_orph_nodes - write orph nodes until there are no more to commit
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function writes orph nodes for all the orphans to commit. If @atomic is
+ * not zero, every orphan area LEB after the orphan head LEB is unmapped
+ * afterwards. On success, %0 is returned, otherwise a negative error code is
+ * returned.
+ */
+static int write_orph_nodes(struct ubifs_info *c, int atomic)
+{
+	int err;
+
+	/* Each call writes one orph node and decreases 'c->cmt_orphans' */
+	while (c->cmt_orphans > 0) {
+		err = write_orph_node(c, atomic);
+		if (err)
+			return err;
+	}
+	if (atomic) {
+		int lnum;
+
+		/* Unmap any unused LEBs after consolidation */
+		for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+/**
+ * consolidate - consolidate the orphan area.
+ * @c: UBIFS file-system description object
+ *
+ * This function enables consolidation by putting all the orphans into the list
+ * to commit. The list is in the order that the orphans were added, and the
+ * LEBs are written atomically in order, so at no time can orphans be lost by
+ * an unclean unmount.
+ *
+ * With @c->orphan_lock held, all orphans on @c->orph_list whose 'new' flag is
+ * clear are chained through 'cnext' starting at @c->orph_cnext, their count is
+ * stored in @c->cmt_orphans, and the orphan head is reset to the start of LEB
+ * @c->orph_first. This is only done if the number of non-new orphans does not
+ * exceed 'tot_avail_orphs()'.
+ *
+ * NOTE(review): the assignment '*last = orphan->cnext' is executed after
+ * 'list_for_each_entry()' has finished, at which point 'orphan' is the cursor
+ * computed from the list head '&c->orph_list' rather than a real list element.
+ * The value read therefore comes from memory positioned relative to that head
+ * inside '*c'. Presumably this depends on how the fields following 'orph_list'
+ * in 'struct ubifs_info' line up with 'struct ubifs_orphan' - neither layout
+ * is visible here, so confirm before changing this line or either structure.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int consolidate(struct ubifs_info *c)
+{
+ int tot_avail = tot_avail_orphs(c), err = 0;
+
+ spin_lock(&c->orphan_lock);
+ dbg_cmt("there is space for %d orphans and there are %d",
+ tot_avail, c->tot_orphans);
+ if (c->tot_orphans - c->new_orphans <= tot_avail) {
+ struct ubifs_orphan *orphan, **last;
+ int cnt = 0;
+
+ /* Change the cnext list to include all non-new orphans */
+ last = &c->orph_cnext;
+ list_for_each_entry(orphan, &c->orph_list, list) {
+ if (orphan->new)
+ continue;
+ *last = orphan;
+ last = &orphan->cnext;
+ cnt += 1;
+ }
+ *last = orphan->cnext;
+ ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
+ c->cmt_orphans = cnt;
+ c->ohead_lnum = c->orph_first;
+ c->ohead_offs = 0;
+ } else {
+ /*
+ * We limit the number of orphans so that this should
+ * never happen.
+ */
+ ubifs_err("out of space in orphan area");
+ err = -EINVAL;
+ }
+ spin_unlock(&c->orphan_lock);
+ return err;
+}
+
+/**
+ * commit_orphans - commit orphans.
+ * @c: UBIFS file-system description object
+ *
+ * This function commits orphans to flash. On success, %0 is returned,
+ * otherwise a negative error code is returned.
+ */
+static int commit_orphans(struct ubifs_info *c)
+{
+	int ret, atomic = 0;
+
+	ubifs_assert(c->cmt_orphans > 0);
+	if (avail_orphs(c) < c->cmt_orphans) {
+		/* Not enough space to write new orphans, so consolidate */
+		ret = consolidate(c);
+		if (ret)
+			return ret;
+		/* A consolidated area is written atomically */
+		atomic = 1;
+	}
+	return write_orph_nodes(c, atomic);
+}
+
+/**
+ * erase_deleted - erase the orphans marked for deletion.
+ * @c: UBIFS file-system description object
+ *
+ * During commit, the orphans being committed cannot be deleted, so they are
+ * marked for deletion and deleted by this function. Also, the recovery
+ * adds killed orphans to the deletion list, and therefore they are deleted
+ * here too.
+ */
+static void erase_deleted(struct ubifs_info *c)
+{
+	struct ubifs_orphan *cur, *next;
+
+	spin_lock(&c->orphan_lock);
+	for (cur = c->orph_dnext; cur; cur = next) {
+		/* Remember the successor before the entry is freed */
+		next = cur->dnext;
+		ubifs_assert(!cur->new);
+		rb_erase(&cur->rb, &c->orph_tree);
+		list_del(&cur->list);
+		c->tot_orphans -= 1;
+		dbg_gen("deleting orphan ino %lu", cur->inum);
+		kfree(cur);
+	}
+	c->orph_dnext = NULL;
+	spin_unlock(&c->orphan_lock);
+}
+
+/**
+ * ubifs_orphan_end_commit - end commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * End commit of orphans.
+ */
+int ubifs_orphan_end_commit(struct ubifs_info *c)
+{
+	/* Write out the orphans collected for this commit, if any */
+	if (c->cmt_orphans != 0) {
+		int ret = commit_orphans(c);
+
+		if (ret)
+			return ret;
+	}
+
+	/* Now the orphans that were marked for deletion can go */
+	erase_deleted(c);
+	return dbg_check_orphans(c);
+}
+
+/**
+ * clear_orphans - erase all LEBs used for orphans.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is not required, then the orphans from the previous session
+ * are not needed. This function locates the LEBs used to record
+ * orphans, and un-maps them.
+ */
+static int clear_orphans(struct ubifs_info *c)
+{
+	int lnum = c->orph_first;
+
+	/* Un-map every LEB of the orphan area, stopping at the first error */
+	while (lnum <= c->orph_last) {
+		int ret = ubifs_leb_unmap(c, lnum);
+
+		if (ret)
+			return ret;
+		lnum++;
+	}
+
+	/* The orphan head goes back to the very start of the area */
+	c->ohead_lnum = c->orph_first;
+	c->ohead_offs = 0;
+	return 0;
+}
+
+/**
+ * insert_dead_orphan - insert an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * This function is a helper to the 'do_kill_orphans()' function. The orphan
+ * must be kept until the next commit, so it is added to the rb-tree and the
+ * deletion list.
+ */
+static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct ubifs_orphan *dead, *cur;
+	struct rb_node **link = &c->orph_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	dead = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
+	if (!dead)
+		return -ENOMEM;
+	dead->inum = inum;
+
+	/* Find the slot in the tree ordered by inode number */
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct ubifs_orphan, rb);
+		if (inum < cur->inum) {
+			link = &parent->rb_left;
+		} else if (inum > cur->inum) {
+			link = &parent->rb_right;
+		} else {
+			/* Already added - no problem */
+			kfree(dead);
+			return 0;
+		}
+	}
+
+	c->tot_orphans += 1;
+	rb_link_node(&dead->rb, parent, link);
+	rb_insert_color(&dead->rb, &c->orph_tree);
+	list_add_tail(&dead->list, &c->orph_list);
+
+	/* Push onto the deletion chain as well */
+	dead->dnext = c->orph_dnext;
+	c->orph_dnext = dead;
+	dbg_mnt("ino %lu, new %d, tot %d",
+		inum, c->new_orphans, c->tot_orphans);
+	return 0;
+}
+
+/**
+ * do_kill_orphans - remove orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB
+ * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @outofdate: whether the LEB is out of date is returned here
+ * @last_flagged: whether the end orph node is encountered
+ *
+ * This function is a helper to the 'kill_orphans()' function. It goes through
+ * every orphan node in a LEB and for every inode number recorded, removes
+ * all keys for that inode from the TNC.
+ *
+ * Each inode number is additionally passed to 'insert_dead_orphan()'. As a
+ * side effect, @c->cmt_no is raised to the highest commit number seen in the
+ * orph nodes.
+ *
+ * Returns %0 on success, including the case where the LEB turns out to be out
+ * of date (then *@outofdate is set to %1 and the rest of the LEB is skipped).
+ * Returns %-EINVAL if a node of another type is found or if an older commit
+ * number shows up after the first node of the LEB, and otherwise the error
+ * code of 'ubifs_tnc_remove_ino()' or 'insert_dead_orphan()'.
+ */
+static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ unsigned long long *last_cmt_no, int *outofdate,
+ int *last_flagged)
+{
+ struct ubifs_scan_node *snod;
+ struct ubifs_orph_node *orph;
+ unsigned long long cmt_no;
+ ino_t inum;
+ int i, n, err, first = 1;
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ if (snod->type != UBIFS_ORPH_NODE) {
+ ubifs_err("invalid node type %d in orphan area at "
+ "%d:%d", snod->type, sleb->lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ return -EINVAL;
+ }
+
+ orph = snod->node;
+
+ /* Check commit number (the top bit is a flag, so it is masked off) */
+ cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
+ /*
+ * The commit number on the master node may be less, because
+ * of a failed commit. If there are several failed commits in a
+ * row, the commit number written on orph nodes will continue to
+ * increase (because the commit number is adjusted here) even
+ * though the commit number on the master node stays the same
+ * because the master node has not been re-written.
+ */
+ if (cmt_no > c->cmt_no)
+ c->cmt_no = cmt_no;
+ if (cmt_no < *last_cmt_no && *last_flagged) {
+ /*
+ * The last orph node had a higher commit number and was
+ * flagged as the last written for that commit number.
+ * That makes this orph node, out of date.
+ *
+ * This is tolerated only for the first node of a LEB;
+ * anywhere else it is reported as an error.
+ */
+ if (!first) {
+ ubifs_err("out of order commit number %llu in "
+ "orphan node at %d:%d",
+ cmt_no, sleb->lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ return -EINVAL;
+ }
+ dbg_rcvry("out of date LEB %d", sleb->lnum);
+ *outofdate = 1;
+ return 0;
+ }
+
+ if (first)
+ first = 0;
+
+ /* Number of 8-byte inode numbers that follow the node header */
+ n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+ for (i = 0; i < n; i++) {
+ inum = le64_to_cpu(orph->inos[i]);
+ dbg_rcvry("deleting orphaned inode %lu", inum);
+ err = ubifs_tnc_remove_ino(c, inum);
+ if (err)
+ return err;
+ err = insert_dead_orphan(c, inum);
+ if (err)
+ return err;
+ }
+
+ /* Remember this node's commit number and whether it was flagged */
+ *last_cmt_no = cmt_no;
+ if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
+ dbg_rcvry("last orph node for commit %llu at %d:%d",
+ cmt_no, sleb->lnum, snod->offs);
+ *last_flagged = 1;
+ } else
+ *last_flagged = 0;
+ }
+
+ return 0;
+}
+
+/**
+ * kill_orphans - remove all orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is required, then orphan inodes recorded during the previous
+ * session (which ended with an unclean unmount) must be deleted from the index.
+ * This is done by updating the TNC, but since the index is not updated until
+ * the next commit, the LEBs where the orphan information is recorded are not
+ * erased until the next commit.
+ */
+static int kill_orphans(struct ubifs_info *c)
+{
+	unsigned long long last_cmt_no = 0;
+	int lnum, ret = 0, outofdate = 0, last_flagged = 0;
+
+	c->ohead_lnum = c->orph_first;
+	c->ohead_offs = 0;
+	/* Check no-orphans flag and skip this if no orphans */
+	if (c->no_orphs) {
+		dbg_rcvry("no orphans");
+		return 0;
+	}
+	/*
+	 * Orph nodes always start at c->orph_first and are written to each
+	 * successive LEB in turn. Generally unused LEBs will have been unmapped
+	 * but may contain out of date orph nodes if the unmap didn't go
+	 * through. In addition, the last orph node written for each commit is
+	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
+	 * there are orph nodes from the next commit (i.e. the commit did not
+	 * complete successfully). In that case, no orphans will have been lost
+	 * due to the way that orphans are written, and any orphans added will
+	 * be valid orphans anyway and so can be deleted.
+	 */
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		struct ubifs_scan_leb *sleb;
+		int stop;
+
+		dbg_rcvry("LEB %d", lnum);
+		sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+		if (IS_ERR(sleb)) {
+			/* A plain scan failed - fall back to LEB recovery */
+			sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+			if (IS_ERR(sleb))
+				return PTR_ERR(sleb);
+		}
+
+		ret = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
+				      &last_flagged);
+		stop = ret || outofdate;
+		if (!stop && sleb->endpt) {
+			/* Orphan head follows the last LEB that had data */
+			c->ohead_lnum = lnum;
+			c->ohead_offs = sleb->endpt;
+		}
+		ubifs_scan_destroy(sleb);
+		if (stop)
+			break;
+	}
+	return ret;
+}
+
+/**
+ * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them.
+ * @c: UBIFS file-system description object
+ * @unclean: indicates recovery from unclean unmount
+ * @read_only: indicates read only mount
+ *
+ * This function is called when mounting to erase orphans from the previous
+ * session. If UBIFS was not unmounted cleanly, then the inodes recorded as
+ * orphans are deleted.
+ */
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
+{
+	c->max_orphans = tot_avail_orphs(c);
+
+	/* The orphan buffer is only allocated for read-write mounts */
+	if (!read_only) {
+		c->orph_buf = vmalloc(c->leb_size);
+		if (!c->orph_buf)
+			return -ENOMEM;
+	}
+
+	/* Unclean unmount: the recorded orphans have to be killed */
+	if (unclean)
+		return kill_orphans(c);
+
+	/* Clean unmount: just wipe the orphan area if we may write */
+	if (!read_only)
+		return clear_orphans(c);
+
+	return 0;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+struct check_orphan { /* an inode number read from the orphan area, kept for the debug check */
+ struct rb_node rb; /* link in the tree rooted at 'check_info.root' */
+ ino_t inum; /* inode number, also the sort key of the tree */
+};
+
+struct check_info { /* state carried through the 'dbg_check_orphans()' index walk */
+ unsigned long last_ino; /* inode number of the most recently visited key */
+ unsigned long tot_inos; /* how many times the inode number changed during the walk */
+ unsigned long missing; /* zero-nlink inodes found in neither orphan tree */
+ unsigned long long leaf_cnt; /* number of leaf nodes visited */
+ struct ubifs_ino_node *node; /* buffer the inode nodes are read into */
+ struct rb_root root; /* tree of 'struct check_orphan' built from the orphan area */
+};
+
+static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
+{
+	struct rb_node *node;
+	int found = 0;
+
+	/* Binary search of the orphan tree under the orphan lock */
+	spin_lock(&c->orphan_lock);
+	for (node = c->orph_tree.rb_node; node; ) {
+		struct ubifs_orphan *cur;
+
+		cur = rb_entry(node, struct ubifs_orphan, rb);
+		if (inum < cur->inum) {
+			node = node->rb_left;
+		} else if (inum > cur->inum) {
+			node = node->rb_right;
+		} else {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&c->orphan_lock);
+	return found;
+}
+
+static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
+{
+	struct check_orphan *fresh, *cur;
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	fresh = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
+	if (!fresh)
+		return -ENOMEM;
+	fresh->inum = inum;
+
+	/* Walk down to the insertion point; duplicates are silently dropped */
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct check_orphan, rb);
+		if (inum < cur->inum) {
+			link = &parent->rb_left;
+		} else if (inum > cur->inum) {
+			link = &parent->rb_right;
+		} else {
+			kfree(fresh);
+			return 0;
+		}
+	}
+
+	rb_link_node(&fresh->rb, parent, link);
+	rb_insert_color(&fresh->rb, root);
+	return 0;
+}
+
+static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
+{
+	struct rb_node *node = root->rb_node;
+
+	/* Returns 1 if @inum is in the check tree, 0 if it is not */
+	while (node) {
+		struct check_orphan *cur;
+
+		cur = rb_entry(node, struct check_orphan, rb);
+		if (inum == cur->inum)
+			return 1;
+		if (inum < cur->inum)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+	return 0;
+}
+
+static void dbg_free_check_tree(struct rb_root *root)
+{
+	struct rb_node *node = root->rb_node;
+	struct rb_node *up;
+
+	while (node) {
+		/* Descend until a node without children is reached */
+		if (node->rb_left) {
+			node = node->rb_left;
+			continue;
+		}
+		if (node->rb_right) {
+			node = node->rb_right;
+			continue;
+		}
+
+		/* Detach the leaf from its parent, free it, then climb */
+		up = rb_parent(node);
+		if (up) {
+			if (up->rb_left == node)
+				up->rb_left = NULL;
+			else
+				up->rb_right = NULL;
+		}
+		kfree(rb_entry(node, struct check_orphan, rb));
+		node = up;
+	}
+}
+
+static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *priv)
+{
+	struct check_info *info = priv;
+	ino_t inum = key_inum(c, &zbr->key);
+	int ret;
+
+	/* Still the same inode as the previous leaf - just count it */
+	if (inum == info->last_ino)
+		goto count_leaf;
+
+	/* Lowest node type is the inode node, so it comes first */
+	if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
+		ubifs_err("found orphan node ino %lu, type %d", inum,
+			  key_type(c, &zbr->key));
+	info->last_ino = inum;
+	info->tot_inos += 1;
+
+	ret = ubifs_tnc_read_node(c, zbr, info->node);
+	if (ret) {
+		ubifs_err("node read failed, error %d", ret);
+		return ret;
+	}
+
+	/* Must be recorded as an orphan */
+	if (info->node->nlink == 0 &&
+	    !dbg_find_check_orphan(&info->root, inum) &&
+	    !dbg_find_orphan(c, inum)) {
+		ubifs_err("missing orphan, ino %lu", inum);
+		info->missing += 1;
+	}
+
+count_leaf:
+	info->leaf_cnt += 1;
+	return 0;
+}
+
+static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
+{
+	struct ubifs_scan_node *snod;
+
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		struct ubifs_orph_node *orph;
+		int k, cnt;
+
+		cond_resched();
+		/* Anything that is not an orph node is ignored here */
+		if (snod->type != UBIFS_ORPH_NODE)
+			continue;
+
+		orph = snod->node;
+		/* Number of 8-byte inode numbers that follow the node header */
+		cnt = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+		for (k = 0; k < cnt; k++) {
+			ino_t inum = le64_to_cpu(orph->inos[k]);
+			int ret = dbg_ins_check_orphan(&ci->root, inum);
+
+			if (ret)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
+{
+	int lnum, ret;
+
+	/* Check no-orphans flag and skip this if no orphans */
+	if (c->no_orphs)
+		return 0;
+
+	/* Scan each orphan area LEB and collect its inode numbers */
+	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+		struct ubifs_scan_leb *sleb;
+
+		sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+		if (IS_ERR(sleb))
+			return PTR_ERR(sleb);
+
+		ret = dbg_read_orphans(ci, sleb);
+		ubifs_scan_destroy(sleb);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int dbg_check_orphans(struct ubifs_info *c)
+{
+	struct check_info info;
+	int ret;
+
+	/* The check only runs when it is enabled in the check flags */
+	if (!(ubifs_chk_flags & UBIFS_CHK_ORPH))
+		return 0;
+
+	info.last_ino = 0;
+	info.tot_inos = 0;
+	info.missing = 0;
+	info.leaf_cnt = 0;
+	info.root = RB_ROOT;
+	info.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+	if (!info.node) {
+		ubifs_err("out of memory");
+		return -ENOMEM;
+	}
+
+	/* Gather the on-flash orphans, then compare them with the index */
+	ret = dbg_scan_orphans(c, &info);
+	if (!ret) {
+		ret = dbg_walk_index(c, &dbg_orphan_check, NULL, &info);
+		if (ret) {
+			ubifs_err("cannot scan TNC, error %d", ret);
+		} else if (info.missing) {
+			ubifs_err("%lu missing orphan(s)", info.missing);
+			ret = -EINVAL;
+		} else {
+			dbg_cmt("last inode number is %lu", info.last_ino);
+			dbg_cmt("total number of inodes is %lu",
+				info.tot_inos);
+			dbg_cmt("total number of leaf nodes is %llu",
+				info.leaf_cnt);
+		}
+	}
+
+	dbg_free_check_tree(&info.root);
+	kfree(info.node);
+	return ret;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
new file mode 100644
index 00000000000..77d26c141cf
--- /dev/null
+++ b/fs/ubifs/recovery.c
@@ -0,0 +1,1519 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions needed to recover from unclean un-mounts.
+ * When UBIFS is mounted, it checks a flag on the master node to determine if
+ * an un-mount was completed successfully. If not, the process of mounting
+ * incorporates additional checking and fixing of on-flash data structures.
+ * UBIFS always cleans away all remnants of an unclean un-mount, so that
+ * errors do not accumulate. However UBIFS defers recovery if it is mounted
+ * read-only, and the flash is not modified in that case.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/**
+ * is_empty - check whether a buffer holds only 0xff bytes.
+ * @buf: buffer to check
+ * @len: number of bytes to check
+ *
+ * Returns %1 if all @len bytes of @buf are 0xff (which is also the case when
+ * @len is not positive) and %0 as soon as any other byte value is seen.
+ */
+static int is_empty(void *buf, int len)
+{
+	uint8_t *byte = buf;
+
+	while (len-- > 0) {
+		if (*byte != 0xff)
+			return 0;
+		byte++;
+	}
+	return 1;
+}
+
+/**
+ * get_master_node - get the last valid master node allowing for corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @pbuf: buffer containing the LEB read, is returned here
+ * @mst: master node, if found, is returned here
+ * @cor: corruption, if found, is returned here
+ *
+ * This function allocates a buffer, reads the LEB into it, and finds and
+ * returns the last valid master node allowing for one area of corruption.
+ * The corrupt area, if there is one, must be consistent with the assumption
+ * that it is the result of an unclean unmount while the master node was being
+ * written. Under those circumstances, it is valid to use the previously written
+ * master node.
+ *
+ * On success the buffer is handed over through @pbuf, and @mst and @cor, when
+ * they are set, point into that buffer. @mst and @cor are written only if a
+ * master node or corruption is actually found, so the caller has to initialize
+ * them. On failure the buffer is freed here, @pbuf is left untouched and both
+ * @mst and @cor are set to %NULL.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
+ struct ubifs_mst_node **mst, void **cor)
+{
+ /* Master nodes are looked for at multiples of 'sz' within the LEB */
+ const int sz = c->mst_node_alsz;
+ int err, offs, len;
+ void *sbuf, *buf;
+
+ sbuf = vmalloc(c->leb_size);
+ if (!sbuf)
+ return -ENOMEM;
+
+ /* -EBADMSG is tolerated: the data read is still examined below */
+ err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
+ if (err && err != -EBADMSG)
+ goto out_free;
+
+ /* Find the first position that is definitely not a node */
+ offs = 0;
+ buf = sbuf;
+ len = c->leb_size;
+ while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
+ struct ubifs_ch *ch = buf;
+
+ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
+ break;
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ /* See if there was a valid master node before that */
+ if (offs) {
+ int ret;
+
+ /* Step back to the last slot that carried the node magic */
+ offs -= sz;
+ buf -= sz;
+ len += sz;
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ if (ret != SCANNED_A_NODE && offs) {
+ /* Could have been corruption so check one place back */
+ offs -= sz;
+ buf -= sz;
+ len += sz;
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ if (ret != SCANNED_A_NODE)
+ /*
+ * We accept only one area of corruption because
+ * we are assuming that it was caused while
+ * trying to write a master node.
+ */
+ goto out_err;
+ }
+ if (ret == SCANNED_A_NODE) {
+ struct ubifs_ch *ch = buf;
+
+ if (ch->node_type != UBIFS_MST_NODE)
+ goto out_err;
+ dbg_rcvry("found a master node at %d:%d", lnum, offs);
+ *mst = buf;
+ /* Move on to the slot that follows the master node */
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ }
+ /* Check for corruption */
+ if (offs < c->leb_size) {
+ /* At most one slot of 'sz' bytes may be non-empty here */
+ if (!is_empty(buf, min_t(int, len, sz))) {
+ *cor = buf;
+ dbg_rcvry("found corruption at %d:%d", lnum, offs);
+ }
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ /* Check remaining empty space */
+ if (offs < c->leb_size)
+ if (!is_empty(buf, len))
+ goto out_err;
+ /* Success: the buffer now belongs to the caller */
+ *pbuf = sbuf;
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out_free:
+ /* '*mst' and '*cor' may point into the buffer, so clear them as well */
+ vfree(sbuf);
+ *mst = NULL;
+ *cor = NULL;
+ return err;
+}
+
+/**
+ * write_rcvrd_mst_node - write recovered master node.
+ * @c: UBIFS file-system description object
+ * @mst: master node
+ *
+ * The node is prepared and written with the %UBIFS_MST_RCVRY flag set, first
+ * to LEB %UBIFS_MST_LNUM and then to the LEB after it. The second LEB is not
+ * written if the first write fails. The original flags of @mst are put back
+ * before returning, whether the writes succeeded or not.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_rcvrd_mst_node(struct ubifs_info *c,
+				struct ubifs_mst_node *mst)
+{
+	int err, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
+	__le32 save_flags;
+
+	dbg_rcvry("recovery");
+
+	/*
+	 * 'mst->flags' is little-endian on-flash data, so the saved copy keeps
+	 * the endianness-annotated type instead of a plain integer.
+	 */
+	save_flags = mst->flags;
+	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
+
+	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
+	err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
+	if (!err)
+		err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM);
+
+	mst->flags = save_flags;
+	return err;
+}
+
+/**
+ * ubifs_recover_master_node - recover the master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function recovers the master node from corruption that may occur due to
+ * an unclean unmount.
+ *
+ * Both master LEBs (%UBIFS_MST_LNUM and the LEB after it) are read and the
+ * master node to trust is chosen from the position of the last valid node in
+ * each LEB and from whether corruption follows it. The chosen node is copied
+ * to 'c->mst_node'. When mounted read-only a copy is kept in
+ * 'c->rcvrd_mst_node' and nothing is written; otherwise the node is written
+ * back to both master LEBs straight away.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_master_node(struct ubifs_info *c)
+{
+ void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
+ struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
+ const int sz = c->mst_node_alsz;
+ int err, offs1, offs2;
+
+ dbg_rcvry("recovery");
+
+ err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
+ if (err)
+ goto out_free;
+
+ err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
+ if (err)
+ goto out_free;
+
+ if (mst1) {
+ /* Offset of the last valid master node within the first LEB */
+ offs1 = (void *)mst1 - buf1;
+ if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
+ (offs1 == 0 && !cor1)) {
+ /*
+ * mst1 was written by recovery at offset 0 with no
+ * corruption.
+ */
+ dbg_rcvry("recovery recovery");
+ mst = mst1;
+ } else if (mst2) {
+ offs2 = (void *)mst2 - buf2;
+ if (offs1 == offs2) {
+ /* Same offset, so must be the same */
+ /* The common header is left out of the comparison */
+ if (memcmp((void *)mst1 + UBIFS_CH_SZ,
+ (void *)mst2 + UBIFS_CH_SZ,
+ UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+ goto out_err;
+ mst = mst1;
+ } else if (offs2 + sz == offs1) {
+ /* 1st LEB was written, 2nd was not */
+ if (cor1)
+ goto out_err;
+ mst = mst1;
+ } else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
+ /* 1st LEB was unmapped and written, 2nd not */
+ if (cor1)
+ goto out_err;
+ mst = mst1;
+ } else
+ goto out_err;
+ } else {
+ /*
+ * 2nd LEB was unmapped and about to be written, so
+ * there must be only one master node in the first LEB
+ * and no corruption.
+ */
+ if (offs1 != 0 || cor1)
+ goto out_err;
+ mst = mst1;
+ }
+ } else {
+ if (!mst2)
+ goto out_err;
+ /*
+ * 1st LEB was unmapped and about to be written, so there must
+ * be no room left in 2nd LEB.
+ */
+ offs2 = (void *)mst2 - buf2;
+ if (offs2 + sz + sz <= c->leb_size)
+ goto out_err;
+ mst = mst2;
+ }
+
+ dbg_rcvry("recovered master node from LEB %d",
+ (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
+
+ /* 'mst' points into buf1 or buf2, so copy it before they are freed */
+ memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
+
+ if ((c->vfs_sb->s_flags & MS_RDONLY)) {
+ /* Read-only mode. Keep a copy for switching to rw mode */
+ c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
+ if (!c->rcvrd_mst_node) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
+ } else {
+ /* Write the recovered master node */
+ c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
+ err = write_rcvrd_mst_node(c, c->mst_node);
+ if (err)
+ goto out_free;
+ }
+
+ vfree(buf2);
+ vfree(buf1);
+
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out_free:
+ /* Dump whatever master nodes were found before freeing the buffers */
+ ubifs_err("failed to recover master node");
+ if (mst1) {
+ dbg_err("dumping first master node");
+ dbg_dump_node(c, mst1);
+ }
+ if (mst2) {
+ dbg_err("dumping second master node");
+ dbg_dump_node(c, mst2);
+ }
+ vfree(buf2);
+ vfree(buf1);
+ return err;
+}
+
+/**
+ * ubifs_write_rcvrd_mst_node - write the recovered master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes out the master node copy kept in 'c->rcvrd_mst_node',
+ * i.e. the node that was recovered while mounted read-only and has to reach
+ * the flash now that we are remounting rw. Both that copy and 'c->mst_node'
+ * get the %UBIFS_MST_DIRTY flag. The copy is freed once it has been written;
+ * it is kept if the write fails. There is nothing to do if no copy exists.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
+{
+	struct ubifs_mst_node *rcvrd = c->rcvrd_mst_node;
+	int err;
+
+	if (!rcvrd)
+		return 0;
+
+	rcvrd->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+
+	err = write_rcvrd_mst_node(c, rcvrd);
+	if (err)
+		return err;
+
+	kfree(rcvrd);
+	c->rcvrd_mst_node = NULL;
+	return 0;
+}
+
+/**
+ * is_last_write - determine if an offset was in the last write to a LEB.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check, starting at the data found at @offs
+ * @offs: offset to check
+ *
+ * This function returns %1 if @offs was in the last write to the LEB whose data
+ * is in @buf, otherwise %0 is returned. If min_io_size is greater than one,
+ * everything from the next min_io_size boundary after @offs up to the end of
+ * the LEB has to be empty (0xff). If min_io_size is one, the non-empty data
+ * starting at @offs must not be longer than %UBIFS_MAX_NODE_SZ.
+ */
+static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
+{
+	uint8_t *bytes = buf;
+	int remaining, first_empty, i;
+
+	if (c->min_io_size == 1) {
+		/* Strip trailing 0xff bytes, working back from the LEB end */
+		remaining = c->leb_size - offs;
+		while (remaining > 0 && bytes[remaining - 1] == 0xff)
+			remaining -= 1;
+		/*
+		 * What is left is the size of the corruption. An unclean
+		 * unmount cannot have corrupted more than the size of 1 node.
+		 */
+		return remaining <= UBIFS_MAX_NODE_SZ;
+	}
+
+	/*
+	 * 'offs' lies in the last wbuf written if nothing but empty space
+	 * follows the next c->min_io_size boundary.
+	 */
+	first_empty = ALIGN(offs + 1, c->min_io_size);
+	remaining = c->leb_size - first_empty;
+	bytes += first_empty - offs;
+
+	for (i = 0; i < remaining; i++)
+		if (bytes[i] != 0xff)
+			return 0;
+	return 1;
+}
+
+/**
+ * clean_buf - clean the data from an LEB sitting in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to clean, positioned at @offs
+ * @lnum: LEB number to clean
+ * @offs: offset from which to clean
+ * @len: length of buffer
+ *
+ * If min_io_size is one, everything from @offs to the end of the LEB is set to
+ * 0xff and @buf, @offs and @len are left alone. Otherwise the buffer is padded
+ * up to the next min_io_size boundary, @buf, @offs and @len are moved on to
+ * that boundary, and everything from there to the end of the LEB is set to
+ * 0xff.
+ */
+static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
+		      int *offs, int *len)
+{
+	int boundary, pad;
+
+	/* Self-assignment; 'lnum' is otherwise used only by 'dbg_rcvry()' */
+	lnum = lnum;
+	dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
+
+	if (c->min_io_size == 1) {
+		memset(*buf, 0xff, c->leb_size - *offs);
+		return;
+	}
+
+	ubifs_assert(!(*offs & 7));
+	boundary = ALIGN(*offs, c->min_io_size);
+	pad = boundary - *offs;
+	ubifs_pad(c, *buf, pad);
+
+	*buf += pad;
+	*offs += pad;
+	*len -= pad;
+	memset(*buf, 0xff, c->leb_size - boundary);
+}
+
+/**
+ * no_more_nodes - determine if there are no more nodes in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check
+ * @len: length of buffer
+ * @lnum: LEB number of the LEB from which @buf was read
+ * @offs: offset from which @buf was read
+ *
+ * This function scans @buf for more nodes and returns %0 if a node is found and
+ * %1 if no more nodes are found. The scan starts at the next min_io_size
+ * boundary after @offs (8 bytes further on if min_io_size is one) and moves on
+ * in steps of 8 bytes. A node that lies entirely within the extent of a corrupt
+ * node which looks like a data node is not counted.
+ */
+static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
+			 int lnum, int offs)
+{
+	int skip, next_offs = 0;
+
+	if (len > UBIFS_DATA_NODE_SZ) {
+		struct ubifs_ch *ch = buf;
+		int dlen = le32_to_cpu(ch->len);
+
+		if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
+		    dlen <= UBIFS_MAX_DATA_NODE_SZ)
+			/* The corrupt node looks like a data node */
+			next_offs = ALIGN(offs + dlen, 8);
+	}
+
+	if (c->min_io_size == 1)
+		skip = 8;
+	else
+		skip = ALIGN(offs + 1, c->min_io_size) - offs;
+
+	offs += skip;
+	buf += skip;
+	len -= skip;
+	while (len > 8) {
+		struct ubifs_ch *ch = buf;
+		uint32_t magic = le32_to_cpu(ch->magic);
+
+		if (magic == UBIFS_NODE_MAGIC) {
+			int ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+
+			if (ret == SCANNED_A_NODE || ret > 0) {
+				/*
+				 * There is a small chance this is just data in
+				 * a data node, so check that possibility. e.g.
+				 * this is part of a file that itself contains
+				 * a UBIFS image.
+				 */
+				if (!next_offs ||
+				    offs + le32_to_cpu(ch->len) > next_offs) {
+					dbg_rcvry("unexpected node at %d:%d",
+						  lnum, offs);
+					return 0;
+				}
+				/*
+				 * The node is inside the corrupt data node, so
+				 * ignore it. Fall through to the advance below:
+				 * skipping it would re-scan the same position
+				 * forever.
+				 */
+			}
+		}
+		offs += 8;
+		buf += 8;
+		len -= 8;
+	}
+	return 1;
+}
+
+/**
+ * fix_unclean_leb - fix an unclean LEB.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB information
+ * @start: offset where scan started
+ *
+ * The LEB is cut back to the end of the last scanned node, or to @start if no
+ * nodes were scanned. When mounted read-only (and not remounting rw) the LEB
+ * is only queued on 'c->unclean_leb_list'. Otherwise it is unmapped if nothing
+ * is to be kept, or else rewritten from 'sleb->buf', padded to min_io_size.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+			   int start)
+{
+	int lnum = sleb->lnum, endpt = start;
+	int err, len;
+
+	/* Get the end offset of the last node we are keeping */
+	if (!list_empty(&sleb->nodes)) {
+		struct ubifs_scan_node *last;
+
+		last = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+				  list);
+		endpt = last->offs + last->len;
+	}
+
+	if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
+		/* Add to recovery list */
+		struct ubifs_unclean_leb *ucleb;
+
+		dbg_rcvry("need to fix LEB %d start %d endpt %d",
+			  lnum, start, sleb->endpt);
+		ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
+		if (!ucleb)
+			return -ENOMEM;
+		ucleb->lnum = lnum;
+		ucleb->endpt = endpt;
+		list_add_tail(&ucleb->list, &c->unclean_leb_list);
+		return 0;
+	}
+
+	/* Write the fixed LEB back to flash */
+	dbg_rcvry("fixing LEB %d start %d endpt %d",
+		  lnum, start, sleb->endpt);
+	if (endpt == 0)
+		return ubifs_leb_unmap(c, lnum);
+
+	len = ALIGN(endpt, c->min_io_size);
+	if (start) {
+		/* The part of the LEB before the scan start is re-read */
+		err = ubi_read(c->ubi, lnum, sleb->buf, 0, start);
+		if (err)
+			return err;
+	}
+
+	/* Pad to min_io_size */
+	if (len > endpt) {
+		int pad_len = len - ALIGN(endpt, 8);
+
+		if (pad_len > 0)
+			ubifs_pad(c, sleb->buf + len - pad_len, pad_len);
+	}
+
+	return ubi_leb_change(c->ubi, lnum, sleb->buf, len, UBI_UNKNOWN);
+}
+
+/**
+ * drop_incomplete_group - drop nodes from an incomplete group.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ *
+ * Nodes are removed from the end of the scanned node list for as long as the
+ * last node has group type %UBIFS_IN_NODE_GROUP. @offs ends up as the offset
+ * of the last node removed and is not written if nothing is removed.
+ *
+ * This function returns %1 if nodes are dropped and %0 otherwise.
+ */
+static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+{
+	struct list_head *nodes = &sleb->nodes;
+	int dropped = 0;
+
+	for (;;) {
+		struct ubifs_scan_node *last;
+		struct ubifs_ch *ch;
+
+		if (list_empty(nodes))
+			break;
+
+		last = list_entry(nodes->prev, struct ubifs_scan_node, list);
+		ch = last->node;
+		if (ch->group_type != UBIFS_IN_NODE_GROUP)
+			break;
+
+		dbg_rcvry("dropping node at %d:%d", sleb->lnum, last->offs);
+		*offs = last->offs;
+		list_del(&last->list);
+		kfree(last);
+		sleb->nodes_cnt -= 1;
+		dropped = 1;
+	}
+	return dropped;
+}
+
+/**
+ * ubifs_recover_leb - scan and recover a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset at which to start scanning
+ * @sbuf: LEB-sized buffer to use
+ * @grouped: nodes may be grouped for recovery
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by the unclean unmount from which we are attempting to recover.
+ *
+ * Corruption is accepted only where it looks like the result of an interrupted
+ * last write. The corrupt area is then cleaned in @sbuf and the LEB is handed
+ * to 'fix_unclean_leb()'. If @grouped is set, trailing nodes that belong to an
+ * incomplete node group are dropped as well.
+ *
+ * This function returns the scanned LEB information on success and an error
+ * pointer ('ERR_PTR()' of a negative error code) on failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf, int grouped)
+{
+ int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
+ int empty_chkd = 0, start = offs;
+ struct ubifs_scan_leb *sleb;
+ void *buf = sbuf + offs;
+
+ dbg_rcvry("%d:%d", lnum, offs);
+
+ sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return sleb;
+
+ /* NOTE(review): presumably set when the read hit an ECC error - confirm */
+ if (sleb->ecc)
+ need_clean = 1;
+
+ while (len >= 8) {
+ int ret;
+
+ dbg_scan("look at LEB %d:%d (%d bytes left)",
+ lnum, offs, len);
+
+ cond_resched();
+
+ /*
+ * Scan quietly until there is an error from which we cannot
+ * recover
+ */
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+
+ if (ret == SCANNED_A_NODE) {
+ /* A valid node, and not a padding node */
+ struct ubifs_ch *ch = buf;
+ int node_len;
+
+ err = ubifs_add_snod(c, sleb, buf, offs);
+ if (err)
+ goto error;
+ node_len = ALIGN(le32_to_cpu(ch->len), 8);
+ offs += node_len;
+ buf += node_len;
+ len -= node_len;
+ continue;
+ }
+
+ if (ret > 0) {
+ /* Padding bytes or a valid padding node */
+ offs += ret;
+ buf += ret;
+ len -= ret;
+ continue;
+ }
+
+ if (ret == SCANNED_EMPTY_SPACE) {
+ if (!is_empty(buf, len)) {
+ /* Not the last write: handled after the loop as corrupt empty space */
+ if (!is_last_write(c, buf, offs))
+ break;
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+ empty_chkd = 1;
+ break;
+ }
+
+ /* If the checks below do not break out, the error handling follows */
+ if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
+ if (is_last_write(c, buf, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ empty_chkd = 1;
+ break;
+ }
+
+ if (ret == SCANNED_A_CORRUPT_NODE)
+ if (no_more_nodes(c, buf, len, lnum, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ empty_chkd = 1;
+ break;
+ }
+
+ if (quiet) {
+ /* Redo the last scan but noisily */
+ quiet = 0;
+ continue;
+ }
+
+ /* Every case ends up at 'corrupted', only the message differs */
+ switch (ret) {
+ case SCANNED_GARBAGE:
+ dbg_err("garbage");
+ goto corrupted;
+ case SCANNED_A_CORRUPT_NODE:
+ case SCANNED_A_BAD_PAD_NODE:
+ dbg_err("bad node");
+ goto corrupted;
+ default:
+ dbg_err("unknown");
+ goto corrupted;
+ }
+ }
+
+ /* Whatever is left over must be empty, unless this was already checked */
+ if (!empty_chkd && !is_empty(buf, len)) {
+ if (is_last_write(c, buf, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ } else {
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
+ goto corrupted;
+ }
+ }
+
+ /* Drop nodes from incomplete group */
+ if (grouped && drop_incomplete_group(sleb, &offs)) {
+ /* 'offs' is now the offset of the earliest dropped node */
+ buf = sbuf + offs;
+ len = c->leb_size - offs;
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+
+ /* The scan must end on a min_io_size boundary */
+ if (offs % c->min_io_size) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+
+ ubifs_end_scan(c, sleb, lnum, offs);
+
+ if (need_clean) {
+ err = fix_unclean_leb(c, sleb, start);
+ if (err)
+ goto error;
+ }
+
+ return sleb;
+
+corrupted:
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ err = -EUCLEAN;
+error:
+ ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+}
+
+/**
+ * get_cs_sqnum - get commit start sequence number.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of commit start node
+ * @offs: offset of commit start node
+ * @cs_sqnum: commit start sequence number is returned here
+ *
+ * The node at @lnum:@offs is read and has to be a valid commit start node
+ * whose commit number equals 'c->cmt_no'. @cs_sqnum is written only on
+ * success.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
+			unsigned long long *cs_sqnum)
+{
+	struct ubifs_cs_node *cs_node = NULL;
+	int err, ret;
+
+	dbg_rcvry("at %d:%d", lnum, offs);
+	cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
+	if (!cs_node)
+		return -ENOMEM;
+	/* A commit start node cannot fit in what is left of the LEB */
+	if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
+		goto out_err;
+	/* -EBADMSG is tolerated because the node is validated below */
+	err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ);
+	if (err && err != -EBADMSG)
+		goto out_free;
+	ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
+	if (ret != SCANNED_A_NODE) {
+		dbg_err("Not a valid node");
+		goto out_err;
+	}
+	if (cs_node->ch.node_type != UBIFS_CS_NODE) {
+		dbg_err("Not a CS node, type is %d", cs_node->ch.node_type);
+		goto out_err;
+	}
+	if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
+		dbg_err("CS node cmt_no %llu != current cmt_no %llu",
+			(unsigned long long)le64_to_cpu(cs_node->cmt_no),
+			c->cmt_no);
+		goto out_err;
+	}
+	*cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
+	dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
+	kfree(cs_node);
+	return 0;
+
+out_err:
+	err = -EINVAL;
+out_free:
+	ubifs_err("failed to get CS sqnum");
+	kfree(cs_node);
+	return err;
+}
+
+/**
+ * ubifs_recover_log_leb - scan and recover a log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by the unclean unmount from which we are attempting to recover.
+ * Unless the next log LEB is the log tail, it is scanned first. The recovery
+ * is refused with %-EUCLEAN if its first node has a sequence number greater
+ * than that of the commit start.
+ *
+ * This function returns the scanned LEB information on success and an error
+ * pointer ('ERR_PTR()' of a negative error code) on failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+					     int offs, void *sbuf)
+{
+	int next_lnum = lnum + 1;
+
+	dbg_rcvry("LEB %d", lnum);
+	/* The log wraps around to its first LEB */
+	if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+		next_lnum = UBIFS_LOG_LNUM;
+
+	if (next_lnum != c->ltail_lnum) {
+		struct ubifs_scan_leb *next;
+		int err = 0;
+
+		/*
+		 * We can only recover at the end of the log, so check that the
+		 * next log LEB is empty or out of date.
+		 */
+		next = ubifs_scan(c, next_lnum, 0, sbuf);
+		if (IS_ERR(next))
+			return next;
+
+		if (next->nodes_cnt) {
+			unsigned long long cs_sqnum = c->cs_sqnum;
+			struct ubifs_scan_node *first;
+
+			first = list_entry(next->nodes.next,
+					   struct ubifs_scan_node, list);
+			/* Commit start sqnum not known yet, read it from flash */
+			if (cs_sqnum == 0)
+				err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
+			if (!err && first->sqnum > cs_sqnum) {
+				ubifs_err("unrecoverable log corruption "
+					  "in LEB %d", lnum);
+				err = -EUCLEAN;
+			}
+		}
+
+		ubifs_scan_destroy(next);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+}
+
+/**
+ * recover_head - recover a head.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of head to recover
+ * @offs: offset of head to recover
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at a head location.
+ * One min. I/O unit at the head (512 bytes if min_io_size is one, less if the
+ * LEB ends sooner) is read. If the read fails or a byte other than 0xff is
+ * found, the LEB is cut back to @offs: it is unmapped if @offs is zero,
+ * otherwise its first @offs bytes are read and written back.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int recover_head(const struct ubifs_info *c, int lnum, int offs,
+			void *sbuf)
+{
+	int len = c->min_io_size > 1 ? c->min_io_size : 512;
+	int err, dirty;
+
+	if (offs + len > c->leb_size)
+		len = c->leb_size - offs;
+	if (len == 0)
+		return 0;
+
+	/* Read at the head location and check it is empty flash */
+	err = ubi_read(c->ubi, lnum, sbuf, offs, len);
+	dirty = err != 0;
+	if (!dirty) {
+		uint8_t *byte = sbuf;
+
+		for (; len; len--)
+			if (*byte++ != 0xff) {
+				dirty = 1;
+				break;
+			}
+	}
+	if (!dirty)
+		return 0;
+
+	dbg_rcvry("cleaning head at %d:%d", lnum, offs);
+	if (offs == 0)
+		return ubifs_leb_unmap(c, lnum);
+
+	/* Keep only what precedes the head */
+	err = ubi_read(c->ubi, lnum, sbuf, 0, offs);
+	if (err)
+		return err;
+	return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN);
+}
+
+/**
+ * ubifs_recover_inl_heads - recover index and LPT heads.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at the index and
+ * LPT head locations. The index head is dealt with first and the LPT head is
+ * left alone if that fails.
+ *
+ * This deals with the recovery of a half-completed journal commit. UBIFS is
+ * careful never to overwrite the last version of the index or the LPT. Because
+ * the index and LPT are wandering trees, data from a half-completed commit will
+ * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
+ * assumed to be empty and will be unmapped anyway before use, or in the index
+ * and LPT heads.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
+{
+	int err;
+
+	/* Heads may only be fixed when writing is allowed */
+	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
+
+	dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
+	err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
+	if (!err) {
+		dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum,
+			  c->nhead_offs);
+		err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
+	}
+
+	return err;
+}
+
+/**
+ * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * @c: UBIFS file-system description object
+ * @ucleb: unclean LEB information
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function reads a LEB up to a point pre-determined by the mount recovery,
+ * checks the nodes, and writes the result back to the flash, thereby cleaning
+ * off any following corruption, or non-fatal ECC errors. A LEB with nothing to
+ * keep is just unmapped. All the data, padding included, is assembled in
+ * @sbuf, which is the buffer that gets written.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int clean_an_unclean_leb(const struct ubifs_info *c,
+				struct ubifs_unclean_leb *ucleb, void *sbuf)
+{
+	int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
+	void *buf = sbuf;
+
+	dbg_rcvry("LEB %d len %d", lnum, len);
+
+	if (len == 0)
+		/* Nothing to read, just unmap it */
+		return ubifs_leb_unmap(c, lnum);
+
+	/* -EBADMSG is tolerated because every node is checked below */
+	err = ubi_read(c->ubi, lnum, buf, offs, len);
+	if (err && err != -EBADMSG)
+		return err;
+
+	while (len >= 8) {
+		int ret;
+
+		cond_resched();
+
+		/* Scan quietly until there is an error */
+		ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+
+		if (ret == SCANNED_A_NODE) {
+			/* A valid node, and not a padding node */
+			struct ubifs_ch *ch = buf;
+			int node_len;
+
+			node_len = ALIGN(le32_to_cpu(ch->len), 8);
+			offs += node_len;
+			buf += node_len;
+			len -= node_len;
+			continue;
+		}
+
+		if (ret > 0) {
+			/* Padding bytes or a valid padding node */
+			offs += ret;
+			buf += ret;
+			len -= ret;
+			continue;
+		}
+
+		if (ret == SCANNED_EMPTY_SPACE) {
+			ubifs_err("unexpected empty space at %d:%d",
+				  lnum, offs);
+			return -EUCLEAN;
+		}
+
+		if (quiet) {
+			/* Redo the last scan but noisily */
+			quiet = 0;
+			continue;
+		}
+
+		ubifs_scanned_corruption(c, lnum, offs, buf);
+		return -EUCLEAN;
+	}
+
+	/* Pad to min_io_size */
+	len = ALIGN(ucleb->endpt, c->min_io_size);
+	if (len > ucleb->endpt) {
+		int pad_len = len - ALIGN(ucleb->endpt, 8);
+
+		if (pad_len > 0) {
+			/*
+			 * Pad inside @sbuf, which is the buffer that was read
+			 * into and is written below, not 'c->sbuf'.
+			 */
+			buf = sbuf + len - pad_len;
+			ubifs_pad(c, buf, pad_len);
+		}
+	}
+
+	/* Write back the LEB atomically */
+	err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
+	if (err)
+		return err;
+
+	dbg_rcvry("cleaned LEB %d", lnum);
+
+	return 0;
+}
+
+/**
+ * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function cleans the LEBs that recovery found in need of writing but
+ * left alone because UBIFS was mounted read-only. This happens when remounting
+ * to read-write mode. The LEBs are taken from 'c->unclean_leb_list' in order;
+ * each one is removed from the list and freed once it has been cleaned. A LEB
+ * that cannot be cleaned stays on the list, as do all the LEBs after it.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
+{
+	dbg_rcvry("recovery");
+
+	for (;;) {
+		struct ubifs_unclean_leb *first;
+		int err;
+
+		if (list_empty(&c->unclean_leb_list))
+			return 0;
+
+		first = list_entry(c->unclean_leb_list.next,
+				   struct ubifs_unclean_leb, list);
+		err = clean_an_unclean_leb(c, first, sbuf);
+		if (err)
+			return err;
+
+		list_del(&first->list);
+		kfree(first);
+	}
+}
+
+/**
+ * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
+ * @c: UBIFS file-system description object
+ *
+ * Out-of-place garbage collection requires always one empty LEB with which to
+ * start garbage collection. The LEB number is recorded in c->gc_lnum and is
+ * written to the master node on unmounting. In the case of an unclean unmount
+ * the value of gc_lnum recorded in the master node is out of date and cannot
+ * be used. Instead, recovery must allocate an empty LEB for this purpose.
+ * However, there may not be enough empty space, in which case it must be
+ * possible to GC the dirtiest LEB into the GC head LEB.
+ *
+ * This function also runs the commit which causes the TNC updates from
+ * size-recovery and orphans to be written to the flash. That is important to
+ * ensure correct replay order for subsequent mounts.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_rcvry_gc_commit(struct ubifs_info *c)
+{
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+ struct ubifs_lprops lp;
+ int lnum, err;
+
+ /* The recorded GC LEB number is not trusted, start without one */
+ c->gc_lnum = -1;
+ if (wbuf->lnum == -1) {
+ dbg_rcvry("no GC head LEB");
+ goto find_free;
+ }
+ /*
+ * See whether the used space in the dirtiest LEB fits in the GC head
+ * LEB.
+ */
+ if (wbuf->offs == c->leb_size) {
+ dbg_rcvry("no room in GC head LEB");
+ goto find_free;
+ }
+ /* NOTE(review): the meaning of the last two arguments is not visible here */
+ err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
+ if (err) {
+ if (err == -ENOSPC)
+ dbg_err("could not find a dirty LEB");
+ return err;
+ }
+ ubifs_assert(!(lp.flags & LPROPS_INDEX));
+ lnum = lp.lnum;
+ if (lp.free + lp.dirty == c->leb_size) {
+ /* An empty LEB was returned */
+ if (lp.free != c->leb_size) {
+ err = ubifs_change_one_lp(c, lnum, c->leb_size,
+ 0, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ c->gc_lnum = lnum;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ /* Run the commit */
+ dbg_rcvry("committing");
+ return ubifs_run_commit(c);
+ }
+ /*
+ * There was no empty LEB so the used space in the dirtiest LEB must fit
+ * in the GC head LEB.
+ */
+ if (lp.free + lp.dirty < wbuf->offs) {
+ dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
+ lnum, wbuf->lnum, wbuf->offs);
+ /* Give the dirty LEB back and look for a free LEB instead */
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ return err;
+ goto find_free;
+ }
+ /*
+ * We run the commit before garbage collection otherwise subsequent
+ * mounts will see the GC and orphan deletion in a different order.
+ */
+ dbg_rcvry("committing");
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ /*
+ * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
+ * - use locking to keep 'ubifs_assert()' happy.
+ */
+ dbg_rcvry("GC'ing LEB %d", lnum);
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ err = ubifs_garbage_collect_leb(c, &lp);
+ if (err >= 0) {
+ /* A non-negative GC result is kept unless the sync fails */
+ int err2 = ubifs_wbuf_sync_nolock(wbuf);
+
+ if (err2)
+ err = err2;
+ }
+ mutex_unlock(&wbuf->io_mutex);
+ if (err < 0) {
+ dbg_err("GC failed, error %d", err);
+ /* -EAGAIN is reported to the caller as -EINVAL */
+ if (err == -EAGAIN)
+ err = -EINVAL;
+ return err;
+ }
+ /* Any GC result other than LEB_RETAINED is treated as an error */
+ if (err != LEB_RETAINED) {
+ dbg_err("GC returned %d", err);
+ return -EINVAL;
+ }
+ /*
+ * NOTE(review): 'c->gc_lnum' was set to -1 above and is not assigned in
+ * this function before this point - presumably
+ * 'ubifs_garbage_collect_leb()' sets it; confirm.
+ */
+ err = ubifs_leb_unmap(c, c->gc_lnum);
+ if (err)
+ return err;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ return 0;
+
+find_free:
+ /*
+ * There is no GC head LEB or the free space in the GC head LEB is too
+ * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
+ * GC is not run.
+ */
+ lnum = ubifs_find_free_leb_for_idx(c);
+ if (lnum < 0) {
+ dbg_err("could not find an empty LEB");
+ return lnum;
+ }
+ /* And reset the index flag */
+ err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_INDEX, 0);
+ if (err)
+ return err;
+ c->gc_lnum = lnum;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ /* Run the commit */
+ dbg_rcvry("committing");
+ return ubifs_run_commit(c);
+}
+
+/**
+ * struct size_entry - inode size information for recovery.
+ * @rb: link in the RB-tree of sizes ('c->size_tree', ordered by @inum)
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ * @inode: inode if pinned in memory awaiting rw mode to fix it (released with
+ *         'iput()' when the size tree is destroyed)
+ */
+struct size_entry {
+ struct rb_node rb;
+ ino_t inum;
+ loff_t i_size;
+ loff_t d_size;
+ int exists;
+ struct inode *inode;
+};
+
+/**
+ * add_ino - add an entry to the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ *
+ * A new zeroed entry is allocated, filled in and linked into 'c->size_tree',
+ * which is ordered by inode number. No check is made for an existing entry
+ * with the same @inum. Returns %0 on success and %-ENOMEM if the entry cannot
+ * be allocated.
+ */
+static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size,
+		   loff_t d_size, int exists)
+{
+	struct rb_node **link = &c->size_tree.rb_node, *parent = NULL;
+	struct size_entry *new;
+
+	new = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new->inum = inum;
+	new->i_size = i_size;
+	new->d_size = d_size;
+	new->exists = exists;
+
+	/* Walk down to the empty slot where the new entry belongs */
+	while (*link) {
+		struct size_entry *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct size_entry, rb);
+		link = inum < cur->inum ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&new->rb, parent, link);
+	rb_insert_color(&new->rb, &c->size_tree);
+
+	return 0;
+}
+
+/**
+ * find_ino - find an entry on the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
+{
+ struct rb_node *p = c->size_tree.rb_node;
+ struct size_entry *e;
+
+ while (p) {
+ e = rb_entry(p, struct size_entry, rb);
+ if (inum < e->inum)
+ p = p->rb_left;
+ else if (inum > e->inum)
+ p = p->rb_right;
+ else
+ return e;
+ }
+ return NULL;
+}
+
+/**
+ * remove_ino - remove an entry from the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static void remove_ino(struct ubifs_info *c, ino_t inum)
+{
+ struct size_entry *e = find_ino(c, inum);
+
+ if (!e)
+ return;
+ rb_erase(&e->rb, &c->size_tree);
+ kfree(e);
+}
+
+/**
+ * ubifs_destroy_size_tree - free resources related to the size tree.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_destroy_size_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = c->size_tree.rb_node;
+ struct size_entry *e;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ e = rb_entry(this, struct size_entry, rb);
+ if (e->inode)
+ iput(e->inode);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &e->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(e);
+ }
+ c->size_tree = RB_ROOT;
+}
+
+/**
+ * ubifs_recover_size_accum - accumulate inode sizes for recovery.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @deletion: node is for a deletion
+ * @new_size: inode size
+ *
+ * This function has two purposes:
+ * 1) to ensure there are no data nodes that fall outside the inode size
+ * 2) to ensure there are no data nodes for inodes that do not exist
+ * To accomplish those purposes, a rb-tree is constructed containing an entry
+ * for each inode number in the journal that has not been deleted, and recording
+ * the size from the inode node, the maximum size of any data node (also altered
+ * by truncations) and a flag indicating a inode number for which no inode node
+ * was present in the journal.
+ *
+ * Note that there is still the possibility that there are data nodes that have
+ * been committed that are beyond the inode size, however the only way to find
+ * them would be to scan the entire index. Alternatively, some provision could
+ * be made to record the size of inodes at the start of commit, which would seem
+ * very cumbersome for a scenario that is quite unlikely and the only negative
+ * consequence of which is wasted space.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+ int deletion, loff_t new_size)
+{
+ ino_t inum = key_inum(c, key);
+ struct size_entry *e;
+ int err;
+
+ switch (key_type(c, key)) {
+ case UBIFS_INO_KEY:
+ if (deletion)
+ remove_ino(c, inum);
+ else {
+ e = find_ino(c, inum);
+ if (e) {
+ e->i_size = new_size;
+ e->exists = 1;
+ } else {
+ err = add_ino(c, inum, new_size, 0, 1);
+ if (err)
+ return err;
+ }
+ }
+ break;
+ case UBIFS_DATA_KEY:
+ e = find_ino(c, inum);
+ if (e) {
+ if (new_size > e->d_size)
+ e->d_size = new_size;
+ } else {
+ err = add_ino(c, inum, 0, new_size, 0);
+ if (err)
+ return err;
+ }
+ break;
+ case UBIFS_TRUN_KEY:
+ e = find_ino(c, inum);
+ if (e)
+ e->d_size = new_size;
+ break;
+ }
+ return 0;
+}
+
+/**
+ * fix_size_in_place - fix inode size in place on flash.
+ * @c: UBIFS file-system description object
+ * @e: inode size information for recovery
+ */
+static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
+{
+ struct ubifs_ino_node *ino = c->sbuf;
+ unsigned char *p;
+ union ubifs_key key;
+ int err, lnum, offs, len;
+ loff_t i_size;
+ uint32_t crc;
+
+ /* Locate the inode node LEB number and offset */
+ ino_key_init(c, &key, e->inum);
+ err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
+ if (err)
+ goto out;
+ /*
+ * If the size recorded on the inode node is greater than the size that
+ * was calculated from nodes in the journal then don't change the inode.
+ */
+ i_size = le64_to_cpu(ino->size);
+ if (i_size >= e->d_size)
+ return 0;
+ /* Read the LEB */
+ err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
+ if (err)
+ goto out;
+ /* Change the size field and recalculate the CRC */
+ ino = c->sbuf + offs;
+ ino->size = cpu_to_le64(e->d_size);
+ len = le32_to_cpu(ino->ch.len);
+ crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
+ ino->ch.crc = cpu_to_le32(crc);
+ /* Work out where data in the LEB ends and free space begins */
+ p = c->sbuf;
+ len = c->leb_size - 1;
+ while (p[len] == 0xff)
+ len -= 1;
+ len = ALIGN(len + 1, c->min_io_size);
+ /* Atomically write the fixed LEB back again */
+ err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+ if (err)
+ goto out;
+ dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
+ i_size, e->d_size);
+ return 0;
+
+out:
+ ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
+ e->inum, e->i_size, e->d_size, err);
+ return err;
+}
+
+/**
+ * ubifs_recover_size - recover inode size.
+ * @c: UBIFS file-system description object
+ *
+ * This function attempts to fix inode size discrepancies identified by the
+ * 'ubifs_recover_size_accum()' function.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size(struct ubifs_info *c)
+{
+ struct rb_node *this = rb_first(&c->size_tree);
+
+ while (this) {
+ struct size_entry *e;
+ int err;
+
+ e = rb_entry(this, struct size_entry, rb);
+ if (!e->exists) {
+ union ubifs_key key;
+
+ ino_key_init(c, &key, e->inum);
+ err = ubifs_tnc_lookup(c, &key, c->sbuf);
+ if (err && err != -ENOENT)
+ return err;
+ if (err == -ENOENT) {
+ /* Remove data nodes that have no inode */
+ dbg_rcvry("removing ino %lu", e->inum);
+ err = ubifs_tnc_remove_ino(c, e->inum);
+ if (err)
+ return err;
+ } else {
+ struct ubifs_ino_node *ino = c->sbuf;
+
+ e->exists = 1;
+ e->i_size = le64_to_cpu(ino->size);
+ }
+ }
+ if (e->exists && e->i_size < e->d_size) {
+ if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
+ /* Fix the inode size and pin it in memory */
+ struct inode *inode;
+
+ inode = ubifs_iget(c->vfs_sb, e->inum);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ if (inode->i_size < e->d_size) {
+ dbg_rcvry("ino %lu size %lld -> %lld",
+ e->inum, e->d_size,
+ inode->i_size);
+ inode->i_size = e->d_size;
+ ubifs_inode(inode)->ui_size = e->d_size;
+ e->inode = inode;
+ this = rb_next(this);
+ continue;
+ }
+ iput(inode);
+ } else {
+ /* Fix the size in place */
+ err = fix_size_in_place(c, e);
+ if (err)
+ return err;
+ if (e->inode)
+ iput(e->inode);
+ }
+ }
+ this = rb_next(this);
+ rb_erase(&e->rb, &c->size_tree);
+ kfree(e);
+ }
+ return 0;
+}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
new file mode 100644
index 00000000000..7399692af85
--- /dev/null
+++ b/fs/ubifs/replay.c
@@ -0,0 +1,1075 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains journal replay code. It runs when the file-system is being
+ * mounted and requires no locking.
+ *
+ * The larger the journal, the longer it takes to scan it, and so the longer it
+ * takes to mount UBIFS. This is why the journal has a limited size, which may
+ * be changed depending on the system requirements. But a larger journal gives
+ * faster I/O speed because the index is written less frequently. So this is a
+ * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
+ * larger the journal, the more memory its index may consume.
+ */
+
+#include "ubifs.h"
+
+/*
+ * Replay flags.
+ *
+ * REPLAY_DELETION: node was deleted
+ * REPLAY_REF: node is a reference node
+ */
+enum {
+ REPLAY_DELETION = 1,
+ REPLAY_REF = 2,
+};
+
+/**
+ * struct replay_entry - replay tree entry.
+ * @lnum: logical eraseblock number of the node
+ * @offs: node offset
+ * @len: node length
+ * @sqnum: node sequence number
+ * @flags: replay flags
+ * @rb: links the replay tree
+ * @key: node key
+ * @nm: directory entry name
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ * @free: amount of free space in a bud
+ * @dirty: amount of dirty space in a bud from padding and deletion nodes
+ *
+ * UBIFS journal replay must compare node sequence numbers, which means it must
+ * build a tree of node information to insert into the TNC.
+ */
+struct replay_entry {
+ int lnum;
+ int offs;
+ int len;
+ unsigned long long sqnum;
+ int flags;
+ struct rb_node rb;
+ union ubifs_key key;
+ union {
+ struct qstr nm;
+ struct {
+ loff_t old_size;
+ loff_t new_size;
+ };
+ struct {
+ int free;
+ int dirty;
+ };
+ };
+};
+
+/**
+ * struct bud_entry - entry in the list of buds to replay.
+ * @list: next bud in the list
+ * @bud: bud description object
+ * @free: free bytes in the bud
+ * @sqnum: reference node sequence number
+ */
+struct bud_entry {
+ struct list_head list;
+ struct ubifs_bud *bud;
+ int free;
+ unsigned long long sqnum;
+};
+
+/**
+ * set_bud_lprops - set free and dirty space used by a bud.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of bud
+ */
+static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+{
+ const struct ubifs_lprops *lp;
+ int err = 0, dirty;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ dirty = lp->dirty;
+ if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+ /*
+ * The LEB was added to the journal with a starting offset of
+ * zero which means the LEB must have been empty. The LEB
+ * property values should be lp->free == c->leb_size and
+ * lp->dirty == 0, but that is not the case. The reason is that
+ * the LEB was garbage collected. The garbage collector resets
+ * the free and dirty space without recording it anywhere except
+ * lprops, so if there is not a commit then lprops does not have
+ * that information next time the file system is mounted.
+ *
+ * We do not need to adjust free space because the scan has told
+ * us the exact value which is recorded in the replay entry as
+ * r->free.
+ *
+ * However we do need to subtract from the dirty space the
+ * amount of space that the garbage collector reclaimed, which
+ * is the whole LEB minus the amount of space that was free.
+ */
+ dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ lp->free, lp->dirty);
+ dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ lp->free, lp->dirty);
+ dirty -= c->leb_size - lp->free;
+ /*
+ * If the replay order was perfect the dirty space would now be
+ * zero. The order is not perfect because the the journal heads
+ * race with eachother. This is not a problem but is does mean
+ * that the dirty space may temporarily exceed c->leb_size
+ * during the replay.
+ */
+ if (dirty != 0)
+ dbg_msg("LEB %d lp: %d free %d dirty "
+ "replay: %d free %d dirty", r->lnum, lp->free,
+ lp->dirty, r->free, r->dirty);
+ }
+ lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * trun_remove_range - apply a replay entry for a truncation to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of truncation
+ */
+static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
+{
+ unsigned min_blk, max_blk;
+ union ubifs_key min_key, max_key;
+ ino_t ino;
+
+ min_blk = r->new_size / UBIFS_BLOCK_SIZE;
+ if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
+ min_blk += 1;
+
+ max_blk = r->old_size / UBIFS_BLOCK_SIZE;
+ if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
+ max_blk -= 1;
+
+ ino = key_inum(c, &r->key);
+
+ data_key_init(c, &min_key, ino, min_blk);
+ data_key_init(c, &max_key, ino, max_blk);
+
+ return ubifs_tnc_remove_range(c, &min_key, &max_key);
+}
+
+/**
+ * apply_replay_entry - apply a replay entry to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry to apply
+ *
+ * Apply a replay entry to the TNC.
+ */
+static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
+{
+ int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+
+ dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
+ r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+
+ /* Set c->replay_sqnum to help deal with dangling branches. */
+ c->replay_sqnum = r->sqnum;
+
+ if (r->flags & REPLAY_REF)
+ err = set_bud_lprops(c, r);
+ else if (is_hash_key(c, &r->key)) {
+ if (deletion)
+ err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
+ else
+ err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
+ r->len, &r->nm);
+ } else {
+ if (deletion)
+ switch (key_type(c, &r->key)) {
+ case UBIFS_INO_KEY:
+ {
+ ino_t inum = key_inum(c, &r->key);
+
+ err = ubifs_tnc_remove_ino(c, inum);
+ break;
+ }
+ case UBIFS_TRUN_KEY:
+ err = trun_remove_range(c, r);
+ break;
+ default:
+ err = ubifs_tnc_remove(c, &r->key);
+ break;
+ }
+ else
+ err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
+ r->len);
+ if (err)
+ return err;
+
+ if (c->need_recovery)
+ err = ubifs_recover_size_accum(c, &r->key, deletion,
+ r->new_size);
+ }
+
+ return err;
+}
+
+/**
+ * destroy_replay_tree - destroy the replay.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay tree.
+ */
+static void destroy_replay_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = c->replay_tree.rb_node;
+ struct replay_entry *r;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ r = rb_entry(this, struct replay_entry, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &r->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ if (is_hash_key(c, &r->key))
+ kfree(r->nm.name);
+ kfree(r);
+ }
+ c->replay_tree = RB_ROOT;
+}
+
+/**
+ * apply_replay_tree - apply the replay tree to the TNC.
+ * @c: UBIFS file-system description object
+ *
+ * Apply the replay tree.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int apply_replay_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = rb_first(&c->replay_tree);
+
+ while (this) {
+ struct replay_entry *r;
+ int err;
+
+ cond_resched();
+
+ r = rb_entry(this, struct replay_entry, rb);
+ err = apply_replay_entry(c, r);
+ if (err)
+ return err;
+ this = rb_next(this);
+ }
+ return 0;
+}
+
+/**
+ * insert_node - insert a node to the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ *
+ * This function inserts a scanned non-direntry node to the replay tree. The
+ * replay tree is an RB-tree containing @struct replay_entry elements which are
+ * indexed by the sequence number. The replay tree is applied at the very end
+ * of the replay process. Since the tree is sorted in sequence number order,
+ * the older modifications are applied first. This function returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
+ union ubifs_key *key, unsigned long long sqnum,
+ int deletion, int *used, loff_t old_size,
+ loff_t new_size)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+
+ if (key_inum(c, key) >= c->highest_inum)
+ c->highest_inum = key_inum(c, key);
+
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ } else if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ if (!deletion)
+ *used += ALIGN(len, 8);
+ r->lnum = lnum;
+ r->offs = offs;
+ r->len = len;
+ r->sqnum = sqnum;
+ r->flags = (deletion ? REPLAY_DELETION : 0);
+ r->old_size = old_size;
+ r->new_size = new_size;
+ key_copy(c, key, &r->key);
+
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * insert_dent - insert a directory entry node into the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @name: directory entry name
+ * @nlen: directory entry name length
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ *
+ * This function inserts a scanned directory entry node to the replay tree.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * This function is also used for extended attribute entries because they are
+ * implemented as directory entry nodes.
+ */
+static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
+ union ubifs_key *key, const char *name, int nlen,
+ unsigned long long sqnum, int deletion, int *used)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+ char *nbuf;
+
+ if (key_inum(c, key) >= c->highest_inum)
+ c->highest_inum = key_inum(c, key);
+
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ }
+ if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+ nbuf = kmalloc(nlen + 1, GFP_KERNEL);
+ if (!nbuf) {
+ kfree(r);
+ return -ENOMEM;
+ }
+
+ if (!deletion)
+ *used += ALIGN(len, 8);
+ r->lnum = lnum;
+ r->offs = offs;
+ r->len = len;
+ r->sqnum = sqnum;
+ r->nm.len = nlen;
+ memcpy(nbuf, name, nlen);
+ nbuf[nlen] = '\0';
+ r->nm.name = nbuf;
+ r->flags = (deletion ? REPLAY_DELETION : 0);
+ key_copy(c, key, &r->key);
+
+ ubifs_assert(!*p);
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * ubifs_validate_entry - validate directory or extended attribute entry node.
+ * @c: UBIFS file-system description object
+ * @dent: the node to validate
+ *
+ * This function validates directory or extended attribute entry node @dent.
+ * Returns zero if the node is all right and a %-EINVAL if not.
+ */
+int ubifs_validate_entry(struct ubifs_info *c,
+ const struct ubifs_dent_node *dent)
+{
+ int key_type = key_type_flash(c, dent->key);
+ int nlen = le16_to_cpu(dent->nlen);
+
+ if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
+ dent->type >= UBIFS_ITYPES_CNT ||
+ nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
+ strnlen(dent->name, nlen) != nlen ||
+ le64_to_cpu(dent->inum) > MAX_INUM) {
+ ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
+ "directory entry" : "extended attribute entry");
+ return -EINVAL;
+ }
+
+ if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
+ ubifs_err("bad key type %d", key_type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * replay_bud - replay a bud logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: bud logical eraseblock number to replay
+ * @offs: bud start offset
+ * @jhead: journal head to which this bud belongs
+ * @free: amount of free space in the bud is returned here
+ * @dirty: amount of dirty space from padding and deletion nodes is returned
+ * here
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+ int *free, int *dirty)
+{
+ int err = 0, used = 0;
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_bud *bud;
+
+ dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
+ if (c->need_recovery)
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+ else
+ sleb = ubifs_scan(c, lnum, offs, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+
+ /*
+ * The bud does not have to start from offset zero - the beginning of
+ * the 'lnum' LEB may contain previously committed data. One of the
+ * things we have to do in replay is to correctly update lprops with
+ * newer information about this LEB.
+ *
+ * At this point lprops thinks that this LEB has 'c->leb_size - offs'
+ * bytes of free space because it only contain information about
+ * committed data.
+ *
+ * But we know that real amount of free space is 'c->leb_size -
+ * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
+ * 'sleb->endpt' is used by bud data. We have to correctly calculate
+ * how much of these data are dirty and update lprops with this
+ * information.
+ *
+ * The dirt in that LEB region is comprised of padding nodes, deletion
+ * nodes, truncation nodes and nodes which are obsoleted by subsequent
+ * nodes in this LEB. So instead of calculating clean space, we
+ * calculate used space ('used' variable).
+ */
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ int deletion = 0;
+
+ cond_resched();
+
+ if (snod->sqnum >= SQNUM_WATERMARK) {
+ ubifs_err("file system's life ended");
+ goto out_dump;
+ }
+
+ if (snod->sqnum > c->max_sqnum)
+ c->max_sqnum = snod->sqnum;
+
+ switch (snod->type) {
+ case UBIFS_INO_NODE:
+ {
+ struct ubifs_ino_node *ino = snod->node;
+ loff_t new_size = le64_to_cpu(ino->size);
+
+ if (le32_to_cpu(ino->nlink) == 0)
+ deletion = 1;
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &snod->key, snod->sqnum, deletion,
+ &used, 0, new_size);
+ break;
+ }
+ case UBIFS_DATA_NODE:
+ {
+ struct ubifs_data_node *dn = snod->node;
+ loff_t new_size = le32_to_cpu(dn->size) +
+ key_block(c, &snod->key) *
+ UBIFS_BLOCK_SIZE;
+
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &snod->key, snod->sqnum, deletion,
+ &used, 0, new_size);
+ break;
+ }
+ case UBIFS_DENT_NODE:
+ case UBIFS_XENT_NODE:
+ {
+ struct ubifs_dent_node *dent = snod->node;
+
+ err = ubifs_validate_entry(c, dent);
+ if (err)
+ goto out_dump;
+
+ err = insert_dent(c, lnum, snod->offs, snod->len,
+ &snod->key, dent->name,
+ le16_to_cpu(dent->nlen), snod->sqnum,
+ !le64_to_cpu(dent->inum), &used);
+ break;
+ }
+ case UBIFS_TRUN_NODE:
+ {
+ struct ubifs_trun_node *trun = snod->node;
+ loff_t old_size = le64_to_cpu(trun->old_size);
+ loff_t new_size = le64_to_cpu(trun->new_size);
+ union ubifs_key key;
+
+ /* Validate truncation node */
+ if (old_size < 0 || old_size > c->max_inode_sz ||
+ new_size < 0 || new_size > c->max_inode_sz ||
+ old_size <= new_size) {
+ ubifs_err("bad truncation node");
+ goto out_dump;
+ }
+
+ /*
+ * Create a fake truncation key just to use the same
+ * functions which expect nodes to have keys.
+ */
+ trun_key_init(c, &key, le32_to_cpu(trun->inum));
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &key, snod->sqnum, 1, &used,
+ old_size, new_size);
+ break;
+ }
+ default:
+ ubifs_err("unexpected node type %d in bud LEB %d:%d",
+ snod->type, lnum, snod->offs);
+ err = -EINVAL;
+ goto out_dump;
+ }
+ if (err)
+ goto out;
+ }
+
+ bud = ubifs_search_bud(c, lnum);
+ if (!bud)
+ BUG();
+
+ ubifs_assert(sleb->endpt - offs >= used);
+ ubifs_assert(sleb->endpt % c->min_io_size == 0);
+
+ if (sleb->endpt + c->min_io_size <= c->leb_size &&
+ !(c->vfs_sb->s_flags & MS_RDONLY))
+ err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
+ sleb->endpt, UBI_SHORTTERM);
+
+ *dirty = sleb->endpt - offs - used;
+ *free = c->leb_size - sleb->endpt;
+
+out:
+ ubifs_scan_destroy(sleb);
+ return err;
+
+out_dump:
+ ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * insert_ref_node - insert a reference node to the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @sqnum: sequence number
+ * @free: amount of free space in bud
+ * @dirty: amount of dirty space from padding and deletion nodes
+ *
+ * This function inserts a reference node to the replay tree and returns zero
+ * in case of success ort a negative error code in case of failure.
+ */
+static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
+ unsigned long long sqnum, int free, int dirty)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+
+ dbg_mnt("add ref LEB %d:%d", lnum, offs);
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ } else if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay tree");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->lnum = lnum;
+ r->offs = offs;
+ r->sqnum = sqnum;
+ r->flags = REPLAY_REF;
+ r->free = free;
+ r->dirty = dirty;
+
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * replay_buds - replay all buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int replay_buds(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+ int err, uninitialized_var(free), uninitialized_var(dirty);
+
+ list_for_each_entry(b, &c->replay_buds, list) {
+ err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
+ &free, &dirty);
+ if (err)
+ return err;
+ err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
+ free, dirty);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * destroy_bud_list - destroy the list of buds to replay.
+ * @c: UBIFS file-system description object
+ */
+static void destroy_bud_list(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+
+ while (!list_empty(&c->replay_buds)) {
+ b = list_entry(c->replay_buds.next, struct bud_entry, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+/**
+ * add_replay_bud - add a bud to the list of buds to replay.
+ * @c: UBIFS file-system description object
+ * @lnum: bud logical eraseblock number to replay
+ * @offs: bud start offset
+ * @jhead: journal head to which this bud belongs
+ * @sqnum: reference node sequence number
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+ unsigned long long sqnum)
+{
+ struct ubifs_bud *bud;
+ struct bud_entry *b;
+
+ dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
+
+ bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
+ if (!bud)
+ return -ENOMEM;
+
+ b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
+ if (!b) {
+ kfree(bud);
+ return -ENOMEM;
+ }
+
+ bud->lnum = lnum;
+ bud->start = offs;
+ bud->jhead = jhead;
+ ubifs_add_bud(c, bud);
+
+ b->bud = bud;
+ b->sqnum = sqnum;
+ list_add_tail(&b->list, &c->replay_buds);
+
+ return 0;
+}
+
+/**
+ * validate_ref - validate a reference node.
+ * @c: UBIFS file-system description object
+ * @ref: the reference node to validate
+ * @ref_lnum: LEB number of the reference node
+ * @ref_offs: reference node offset
+ *
+ * This function returns %1 if a bud reference already exists for the LEB. %0 is
+ * returned if the reference node is new, otherwise %-EINVAL is returned if
+ * validation failed.
+ */
+static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
+{
+ struct ubifs_bud *bud;
+ int lnum = le32_to_cpu(ref->lnum);
+ unsigned int offs = le32_to_cpu(ref->offs);
+ unsigned int jhead = le32_to_cpu(ref->jhead);
+
+ /*
+ * ref->offs may point to the end of LEB when the journal head points
+ * to the end of LEB and we write reference node for it during commit.
+ * So this is why we require 'offs > c->leb_size'.
+ */
+ if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
+ lnum < c->main_first || offs > c->leb_size ||
+ offs & (c->min_io_size - 1))
+ return -EINVAL;
+
+ /* Make sure we have not already looked at this bud */
+ bud = ubifs_search_bud(c, lnum);
+ if (bud) {
+ if (bud->jhead == jhead && bud->start <= offs)
+ return 1;
+ ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * replay_log_leb - replay a log logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: log logical eraseblock to replay
+ * @offs: offset to start replaying from
+ * @sbuf: scan buffer
+ *
+ * This function replays a log LEB and returns zero in case of success, %1 if
+ * this is the last LEB in the log, and a negative error code in case of
+ * failure.
+ */
+static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
+{
+ int err;
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ const struct ubifs_cs_node *node;
+
+ dbg_mnt("replay log LEB %d:%d", lnum, offs);
+ sleb = ubifs_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb)) {
+ if (c->need_recovery)
+ sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ }
+
+ if (sleb->nodes_cnt == 0) {
+ err = 1;
+ goto out;
+ }
+
+ node = sleb->buf;
+
+ snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+ if (c->cs_sqnum == 0) {
+ /*
+ * This is the first log LEB we are looking at, make sure that
+ * the first node is a commit start node. Also record its
+ * sequence number so that UBIFS can determine where the log
+ * ends, because all nodes which were have higher sequence
+ * numbers.
+ */
+ if (snod->type != UBIFS_CS_NODE) {
+ dbg_err("first log node at LEB %d:%d is not CS node",
+ lnum, offs);
+ goto out_dump;
+ }
+ if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
+ dbg_err("first CS node at LEB %d:%d has wrong "
+ "commit number %llu expected %llu",
+ lnum, offs,
+ (unsigned long long)le64_to_cpu(node->cmt_no),
+ c->cmt_no);
+ goto out_dump;
+ }
+
+ c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
+ dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+ }
+
+ if (snod->sqnum < c->cs_sqnum) {
+ /*
+ * This means that we reached end of log and now
+ * look to the older log data, which was already
+ * committed but the eraseblock was not erased (UBIFS
+ * only unmaps it). So this basically means we have to
+ * exit with "end of log" code.
+ */
+ err = 1;
+ goto out;
+ }
+
+ /* Make sure the first node sits at offset zero of the LEB */
+ if (snod->offs != 0) {
+ dbg_err("first node is not at zero offset");
+ goto out_dump;
+ }
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+
+ cond_resched();
+
+ if (snod->sqnum >= SQNUM_WATERMARK) {
+ ubifs_err("file system's life ended");
+ goto out_dump;
+ }
+
+ if (snod->sqnum < c->cs_sqnum) {
+ dbg_err("bad sqnum %llu, commit sqnum %llu",
+ snod->sqnum, c->cs_sqnum);
+ goto out_dump;
+ }
+
+ if (snod->sqnum > c->max_sqnum)
+ c->max_sqnum = snod->sqnum;
+
+ switch (snod->type) {
+ case UBIFS_REF_NODE: {
+ const struct ubifs_ref_node *ref = snod->node;
+
+ err = validate_ref(c, ref);
+ if (err == 1)
+ break; /* Already have this bud */
+ if (err)
+ goto out_dump;
+
+ err = add_replay_bud(c, le32_to_cpu(ref->lnum),
+ le32_to_cpu(ref->offs),
+ le32_to_cpu(ref->jhead),
+ snod->sqnum);
+ if (err)
+ goto out;
+
+ break;
+ }
+ case UBIFS_CS_NODE:
+ /* Make sure it sits at the beginning of LEB */
+ if (snod->offs != 0) {
+ ubifs_err("unexpected node in log");
+ goto out_dump;
+ }
+ break;
+ default:
+ ubifs_err("unexpected node in log");
+ goto out_dump;
+ }
+ }
+
+ if (sleb->endpt || c->lhead_offs >= c->leb_size) {
+ c->lhead_lnum = lnum;
+ c->lhead_offs = sleb->endpt;
+ }
+
+ err = !sleb->endpt;
+out:
+ ubifs_scan_destroy(sleb);
+ return err;
+
+out_dump:
+ ubifs_err("log error detected while replying the log at LEB %d:%d",
+ lnum, offs + snod->offs);
+ dbg_dump_node(c, snod->node);
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * take_ihead - update the status of the index head in lprops to 'taken'.
+ * @c: UBIFS file-system description object
+ *
+ * This function looks up the LEB properties of the index head LEB
+ * (@c->ihead_lnum), remembers its free space and then sets the %LPROPS_TAKEN
+ * flag on it, leaving the free and dirty values unchanged (%LPROPS_NC). The
+ * lookup and the update are bracketed by 'ubifs_get_lprops()' and
+ * 'ubifs_release_lprops()'.
+ *
+ * This function returns the amount of free space in the index head LEB (as
+ * read before the flag was set) or a negative error code.
+ */
+static int take_ihead(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lp;
+ int err, free;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ free = lp->free;
+
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ err = free;
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_replay_journal - replay journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the journal, replays and cleans it up. It makes sure all
+ * memory data structures related to uncommitted journal are built (dirty TNC
+ * tree, tree of buds, modified lprops, etc).
+ *
+ * The index head LEB is marked as taken first and its free space is checked
+ * against @c->ihead_offs. Then the log LEBs are walked starting from the log
+ * head (wrapping around at the end of the log area) until 'replay_log_leb()'
+ * reports the end of the log or all @c->log_lebs LEBs were visited. After that
+ * the buds are replayed and the replay tree is applied. @c->replaying is set
+ * while this is in progress.
+ *
+ * Returns zero in case of success. In case of failure the error code of the
+ * failing step is returned (%-EINVAL for a bad index head, %-ENOMEM if the
+ * scan buffer cannot be allocated, or whatever the replay helpers returned).
+ */
+int ubifs_replay_journal(struct ubifs_info *c)
+{
+ int err, i, lnum, offs, free;
+ void *sbuf = NULL;
+
+ BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
+
+ /* Update the status of the index head in lprops to 'taken' */
+ free = take_ihead(c);
+ if (free < 0)
+ return free; /* Error code */
+
+ if (c->ihead_offs != c->leb_size - free) {
+ ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
+ c->ihead_offs);
+ return -EINVAL;
+ }
+
+ sbuf = vmalloc(c->leb_size);
+ if (!sbuf)
+ return -ENOMEM;
+
+ dbg_mnt("start replaying the journal");
+
+ c->replaying = 1;
+
+ lnum = c->ltail_lnum = c->lhead_lnum;
+ offs = c->lhead_offs;
+
+ for (i = 0; i < c->log_lebs; i++, lnum++) {
+ if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
+ /*
+ * The log is logically circular, we reached the last
+ * LEB, switch to the first one.
+ */
+ lnum = UBIFS_LOG_LNUM;
+ offs = 0;
+ }
+ err = replay_log_leb(c, lnum, offs, sbuf);
+ if (err == 1)
+ /* We hit the end of the log */
+ break;
+ if (err)
+ goto out;
+ offs = 0;
+ }
+
+ err = replay_buds(c);
+ if (err)
+ goto out;
+
+ err = apply_replay_tree(c);
+ if (err)
+ goto out;
+
+ ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
+ dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
+ "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
+ c->highest_inum);
+out:
+ destroy_replay_tree(c);
+ destroy_bud_list(c);
+ vfree(sbuf);
+ c->replaying = 0;
+ return err;
+}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
new file mode 100644
index 00000000000..2bf753b3888
--- /dev/null
+++ b/fs/ubifs/sb.c
@@ -0,0 +1,629 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS superblock. The superblock is stored at the first
+ * LEB of the volume and is never changed by UBIFS. Only user-space tools may
+ * change it. The superblock node mostly contains geometry information.
+ */
+
+#include "ubifs.h"
+#include <linux/random.h>
+
+/*
+ * Default journal size in logical eraseblocks as a percent of total
+ * flash size. The result is clamped by 'create_default_filesystem()' to at
+ * least %UBIFS_MIN_JNL_LEBS and at most %DEFAULT_MAX_JNL bytes.
+ */
+#define DEFAULT_JNL_PERCENT 5
+
+/* Default maximum journal size in bytes (32 MiB) */
+#define DEFAULT_MAX_JNL (32*1024*1024)
+
+/* Default indexing tree fanout */
+#define DEFAULT_FANOUT 8
+
+/* Default number of data journal heads */
+#define DEFAULT_JHEADS_CNT 1
+
+/*
+ * Default positions of different LEBs in the main area, counted from the
+ * first main area LEB
+ */
+#define DEFAULT_IDX_LEB 0
+#define DEFAULT_DATA_LEB 1
+#define DEFAULT_GC_LEB 2
+
+/* Default number of LEB numbers in LPT's save table */
+#define DEFAULT_LSAVE_CNT 256
+
+/* Default reserved pool size as a percent of maximum free space */
+#define DEFAULT_RP_PERCENT 5
+
+/* The default maximum size of reserved pool in bytes (5 MiB) */
+#define DEFAULT_MAX_RP_SIZE (5*1024*1024)
+
+/* Default time granularity in nanoseconds (one second) */
+#define DEFAULT_TIME_GRAN 1000000000
+
+/**
+ * create_default_filesystem - format empty UBI volume.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates default empty file-system. It calculates the default
+ * geometry (journal, log, orphan, LPT and main areas) and then writes, in this
+ * order, the superblock node, both copies of the master node, the root
+ * indexing node, the root inode and a commit start node at the beginning of
+ * the log. Every write is checked and the first failure aborts the formatting.
+ *
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int create_default_filesystem(struct ubifs_info *c)
+{
+ struct ubifs_sb_node *sup;
+ struct ubifs_mst_node *mst;
+ struct ubifs_idx_node *idx;
+ struct ubifs_branch *br;
+ struct ubifs_ino_node *ino;
+ struct ubifs_cs_node *cs;
+ union ubifs_key key;
+ int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
+ int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
+ int min_leb_cnt = UBIFS_MIN_LEB_CNT;
+ uint64_t tmp64, main_bytes;
+ __le64 tmp_le64;
+
+ /* Some functions called from here depend on the @c->key_len field */
+ c->key_len = UBIFS_SK_LEN;
+
+ /*
+ * First of all, we have to calculate default file-system geometry -
+ * log size, journal size, etc.
+ */
+ if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
+ /* We can first multiply then divide and have no overflow */
+ jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
+ else
+ jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;
+
+ if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
+ jnl_lebs = UBIFS_MIN_JNL_LEBS;
+ if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
+ jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;
+
+ /*
+ * The log should be large enough to fit reference nodes for all bud
+ * LEBs. Because buds do not have to start from the beginning of LEBs
+ * (half of the LEB may contain committed data), the log should
+ * generally be larger, make it twice as large.
+ */
+ tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
+ log_lebs = tmp / c->leb_size;
+ /* Plus one LEB reserved for commit */
+ log_lebs += 1;
+ if (c->leb_cnt - min_leb_cnt > 8) {
+ /* And some extra space to allow writes while committing */
+ log_lebs += 1;
+ min_leb_cnt += 1;
+ }
+
+ max_buds = jnl_lebs - log_lebs;
+ if (max_buds < UBIFS_MIN_BUD_LEBS)
+ max_buds = UBIFS_MIN_BUD_LEBS;
+
+ /*
+ * Orphan nodes are stored in a separate area. One node can store a lot
+ * of orphan inode numbers, but when new orphan comes we just add a new
+ * orphan node. At some point the nodes are consolidated into one
+ * orphan node.
+ */
+ orph_lebs = UBIFS_MIN_ORPH_LEBS;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ if (c->leb_cnt - min_leb_cnt > 1)
+ /*
+ * For debugging purposes it is better to have at least 2
+ * orphan LEBs, because the orphan subsystem would need to do
+ * consolidations and would be stressed more.
+ */
+ orph_lebs += 1;
+#endif
+
+ main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
+ main_lebs -= orph_lebs;
+
+ lpt_first = UBIFS_LOG_LNUM + log_lebs;
+ c->lsave_cnt = DEFAULT_LSAVE_CNT;
+ c->max_leb_cnt = c->leb_cnt;
+ err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
+ &big_lpt);
+ if (err)
+ return err;
+
+ dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
+ lpt_first + lpt_lebs - 1);
+
+ main_first = c->leb_cnt - main_lebs;
+
+ /* Create default superblock */
+ tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+ sup = kzalloc(tmp, GFP_KERNEL);
+ if (!sup)
+ return -ENOMEM;
+
+ tmp64 = (uint64_t)max_buds * c->leb_size;
+ if (big_lpt)
+ sup_flags |= UBIFS_FLG_BIGLPT;
+
+ sup->ch.node_type = UBIFS_SB_NODE;
+ sup->key_hash = UBIFS_KEY_HASH_R5;
+ sup->flags = cpu_to_le32(sup_flags);
+ sup->min_io_size = cpu_to_le32(c->min_io_size);
+ sup->leb_size = cpu_to_le32(c->leb_size);
+ sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+ sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt);
+ sup->max_bud_bytes = cpu_to_le64(tmp64);
+ sup->log_lebs = cpu_to_le32(log_lebs);
+ sup->lpt_lebs = cpu_to_le32(lpt_lebs);
+ sup->orph_lebs = cpu_to_le32(orph_lebs);
+ sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT);
+ sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
+ sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
+ sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
+ sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
+ sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
+
+ generate_random_uuid(sup->uuid);
+
+ main_bytes = (uint64_t)main_lebs * c->leb_size;
+ tmp64 = main_bytes * DEFAULT_RP_PERCENT;
+ do_div(tmp64, 100);
+ if (tmp64 > DEFAULT_MAX_RP_SIZE)
+ tmp64 = DEFAULT_MAX_RP_SIZE;
+ sup->rp_size = cpu_to_le64(tmp64);
+
+ err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
+ kfree(sup);
+ if (err)
+ return err;
+
+ dbg_gen("default superblock created at LEB 0:0");
+
+ /* Create default master node */
+ mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+ if (!mst)
+ return -ENOMEM;
+
+ mst->ch.node_type = UBIFS_MST_NODE;
+ mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM);
+ mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
+ mst->cmt_no = 0;
+ mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+ mst->root_offs = 0;
+ tmp = ubifs_idx_node_sz(c, 1);
+ mst->root_len = cpu_to_le32(tmp);
+ mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB);
+ mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+ mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size));
+ mst->index_size = cpu_to_le64(ALIGN(tmp, 8));
+ mst->lpt_lnum = cpu_to_le32(c->lpt_lnum);
+ mst->lpt_offs = cpu_to_le32(c->lpt_offs);
+ mst->nhead_lnum = cpu_to_le32(c->nhead_lnum);
+ mst->nhead_offs = cpu_to_le32(c->nhead_offs);
+ mst->ltab_lnum = cpu_to_le32(c->ltab_lnum);
+ mst->ltab_offs = cpu_to_le32(c->ltab_offs);
+ mst->lsave_lnum = cpu_to_le32(c->lsave_lnum);
+ mst->lsave_offs = cpu_to_le32(c->lsave_offs);
+ mst->lscan_lnum = cpu_to_le32(main_first);
+ mst->empty_lebs = cpu_to_le32(main_lebs - 2);
+ mst->idx_lebs = cpu_to_le32(1);
+ mst->leb_cnt = cpu_to_le32(c->leb_cnt);
+
+ /* Calculate lprops statistics */
+ tmp64 = main_bytes;
+ tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+ tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+ mst->total_free = cpu_to_le64(tmp64);
+
+ tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+ ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
+ UBIFS_INO_NODE_SZ;
+ tmp64 += ino_waste;
+ tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
+ mst->total_dirty = cpu_to_le64(tmp64);
+
+ /* The indexing LEB does not contribute to dark space */
+ tmp64 = (c->main_lebs - 1) * c->dark_wm;
+ mst->total_dark = cpu_to_le64(tmp64);
+
+ mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+ /* The master node is written twice, to two consecutive LEBs */
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
+ UBI_UNKNOWN);
+ if (err) {
+ kfree(mst);
+ return err;
+ }
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
+ UBI_UNKNOWN);
+ kfree(mst);
+ if (err)
+ return err;
+
+ dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
+
+ /* Create the root indexing node */
+ tmp = ubifs_idx_node_sz(c, 1);
+ idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
+ if (!idx)
+ return -ENOMEM;
+
+ c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
+ c->key_hash = key_r5_hash;
+
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(1);
+ ino_key_init(c, &key, UBIFS_ROOT_INO);
+ br = ubifs_idx_branch(c, idx, 0);
+ key_write_idx(c, &key, &br->key);
+ br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
+ br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
+ err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
+ UBI_UNKNOWN);
+ kfree(idx);
+ if (err)
+ return err;
+
+ dbg_gen("default root indexing node created LEB %d:0",
+ main_first + DEFAULT_IDX_LEB);
+
+ /* Create default root inode */
+ tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+ ino = kzalloc(tmp, GFP_KERNEL);
+ if (!ino)
+ return -ENOMEM;
+
+ ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
+ ino->ch.node_type = UBIFS_INO_NODE;
+ ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
+ ino->nlink = cpu_to_le32(2);
+ /*
+ * The converted time-stamp must be kept in a 64-bit little-endian
+ * variable. Storing it in an 'int' truncates it, which on big-endian
+ * machines drops exactly the bytes that carry the value.
+ */
+ tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+ ino->atime_sec = tmp_le64;
+ ino->ctime_sec = tmp_le64;
+ ino->mtime_sec = tmp_le64;
+ ino->atime_nsec = 0;
+ ino->ctime_nsec = 0;
+ ino->mtime_nsec = 0;
+ ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
+ ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+ /* Set compression enabled by default */
+ ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
+
+ err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
+ main_first + DEFAULT_DATA_LEB, 0,
+ UBI_UNKNOWN);
+ kfree(ino);
+ if (err)
+ return err;
+
+ dbg_gen("root inode created at LEB %d:0",
+ main_first + DEFAULT_DATA_LEB);
+
+ /*
+ * The first node in the log has to be the commit start node. This is
+ * always the case during normal file-system operation. Write a fake
+ * commit start node to the log.
+ */
+ tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
+ cs = kzalloc(tmp, GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ cs->ch.node_type = UBIFS_CS_NODE;
+ err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
+ 0, UBI_UNKNOWN);
+ kfree(cs);
+ /* Do not report success if the commit start node was not written */
+ if (err)
+ return err;
+
+ ubifs_msg("default file-system created");
+ return 0;
+}
+
+/**
+ * validate_sb - validate superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node
+ *
+ * This function validates superblock node @sup. Since most of data was read
+ * from the superblock and stored in @c, the function validates fields in @c
+ * instead. Returns zero in case of success and %-EINVAL in case of validation
+ * failure. On failure the number of the failed check is printed (checks which
+ * print their own detailed message leave it at %1) and the superblock node is
+ * dumped.
+ *
+ * Note, the @c->main_lebs check reported as error %7 repeats a condition which
+ * is already rejected as part of error %4.
+ */
+static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+ long long max_bytes;
+ int err = 1, min_leb_cnt;
+
+ if (!c->key_hash) {
+ err = 2;
+ goto failed;
+ }
+
+ if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
+ err = 3;
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
+ ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
+ le32_to_cpu(sup->min_io_size), c->min_io_size);
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->leb_size) != c->leb_size) {
+ ubifs_err("LEB size mismatch: %d in superblock, %d real",
+ le32_to_cpu(sup->leb_size), c->leb_size);
+ goto failed;
+ }
+
+ if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
+ c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
+ c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
+ c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+ err = 4;
+ goto failed;
+ }
+
+ /*
+ * Calculate the minimum allowed total amount of LEBs. This is very
+ * similar to %UBIFS_MIN_LEB_CNT, but we take into account the real
+ * values we have just read from the superblock.
+ */
+ min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
+ min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
+
+ if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
+ ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
+ "%d minimum required", c->leb_cnt, c->vi.size,
+ min_leb_cnt);
+ goto failed;
+ }
+
+ if (c->max_leb_cnt < c->leb_cnt) {
+ ubifs_err("max. LEB count %d less than LEB count %d",
+ c->max_leb_cnt, c->leb_cnt);
+ goto failed;
+ }
+
+ if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+ err = 7;
+ goto failed;
+ }
+
+ if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
+ c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
+ err = 8;
+ goto failed;
+ }
+
+ if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
+ c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
+ err = 9;
+ goto failed;
+ }
+
+ if (c->fanout < UBIFS_MIN_FANOUT ||
+ ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
+ err = 10;
+ goto failed;
+ }
+
+ if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
+ c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
+ c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
+ err = 11;
+ goto failed;
+ }
+
+ if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
+ c->orph_lebs + c->main_lebs != c->leb_cnt) {
+ err = 12;
+ goto failed;
+ }
+
+ if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+ err = 13;
+ goto failed;
+ }
+
+ max_bytes = c->main_lebs * (long long)c->leb_size;
+ if (c->rp_size < 0 || max_bytes < c->rp_size) {
+ err = 14;
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->time_gran) > 1000000000 ||
+ le32_to_cpu(sup->time_gran) < 1) {
+ err = 15;
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ ubifs_err("bad superblock, error %d", err);
+ dbg_dump_node(c, sup);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_sb_node - read superblock node.
+ * @c: UBIFS file-system description object
+ *
+ * This function allocates a buffer of %UBIFS_SB_NODE_SZ bytes aligned up to
+ * the min. I/O unit size and reads the superblock node from LEB
+ * %UBIFS_SB_LNUM, offset zero, into it. Returns a pointer to the superblock
+ * node, or an 'ERR_PTR()'-encoded negative error code (%-ENOMEM if the
+ * allocation failed, otherwise the 'ubifs_read_node()' error). The buffer is
+ * not freed here on success; 'ubifs_read_superblock()' releases it with
+ * 'kfree()'.
+ */
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
+{
+ struct ubifs_sb_node *sup;
+ int err;
+
+ sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS);
+ if (!sup)
+ return ERR_PTR(-ENOMEM);
+
+ err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ,
+ UBIFS_SB_LNUM, 0);
+ if (err) {
+ kfree(sup);
+ return ERR_PTR(err);
+ }
+
+ return sup;
+}
+
+/**
+ * ubifs_write_sb_node - write superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node read with 'ubifs_read_sb_node()'
+ *
+ * This function prepares node @sup with 'ubifs_prepare_node()' and then
+ * changes the contents of LEB %UBIFS_SB_LNUM to it with 'ubifs_leb_change()'.
+ * The amount of data written is %UBIFS_SB_NODE_SZ aligned up to the min. I/O
+ * unit size, so the @sup buffer has to be at least that large.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+ int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+
+ ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
+ return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
+}
+
+/**
+ * ubifs_read_superblock - read superblock.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds, reads and checks the superblock. If an empty UBI volume
+ * is being mounted, this function creates default superblock. Returns zero in
+ * case of success, and a negative error code in case of failure.
+ *
+ * The on-flash format version is checked first, then the superblock fields
+ * are copied to @c. If the UBI volume is larger than the LEB count stored in
+ * the superblock and the maximum LEB count allows it, the file-system is
+ * enlarged; the new LEB count is written back to the superblock unless the
+ * file-system is mounted read-only. Finally the derived layout fields of @c
+ * are calculated and everything is validated with 'validate_sb()'. The
+ * superblock node buffer is freed before returning.
+ */
+int ubifs_read_superblock(struct ubifs_info *c)
+{
+ int err, sup_flags;
+ struct ubifs_sb_node *sup;
+
+ if (c->empty) {
+ err = create_default_filesystem(c);
+ if (err)
+ return err;
+ }
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ /*
+ * The software supports all previous versions but not future versions,
+ * due to the unavailability of time-travelling equipment.
+ */
+ c->fmt_version = le32_to_cpu(sup->fmt_version);
+ if (c->fmt_version > UBIFS_FORMAT_VERSION) {
+ ubifs_err("on-flash format version is %d, but software only "
+ "supports up to version %d", c->fmt_version,
+ UBIFS_FORMAT_VERSION);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (c->fmt_version < 3) {
+ ubifs_err("on-flash format version %d is not supported",
+ c->fmt_version);
+ err = -EINVAL;
+ goto out;
+ }
+
+ switch (sup->key_hash) {
+ case UBIFS_KEY_HASH_R5:
+ c->key_hash = key_r5_hash;
+ c->key_hash_type = UBIFS_KEY_HASH_R5;
+ break;
+
+ case UBIFS_KEY_HASH_TEST:
+ c->key_hash = key_test_hash;
+ c->key_hash_type = UBIFS_KEY_HASH_TEST;
+ break;
+ };
+
+ c->key_fmt = sup->key_fmt;
+
+ switch (c->key_fmt) {
+ case UBIFS_SIMPLE_KEY_FMT:
+ c->key_len = UBIFS_SK_LEN;
+ break;
+ default:
+ ubifs_err("unsupported key format");
+ err = -EINVAL;
+ goto out;
+ }
+
+ c->leb_cnt = le32_to_cpu(sup->leb_cnt);
+ c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt);
+ c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);
+ c->log_lebs = le32_to_cpu(sup->log_lebs);
+ c->lpt_lebs = le32_to_cpu(sup->lpt_lebs);
+ c->orph_lebs = le32_to_cpu(sup->orph_lebs);
+ c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
+ c->fanout = le32_to_cpu(sup->fanout);
+ c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
+ c->default_compr = le16_to_cpu(sup->default_compr);
+ c->rp_size = le64_to_cpu(sup->rp_size);
+ c->rp_uid = le32_to_cpu(sup->rp_uid);
+ c->rp_gid = le32_to_cpu(sup->rp_gid);
+ sup_flags = le32_to_cpu(sup->flags);
+
+ c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
+
+ memcpy(&c->uuid, &sup->uuid, 16);
+
+ c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+
+ /*
+ * Automatically increase file system size to the maximum size. If
+ * mounted read-only, the new size is used but not written to flash.
+ */
+ c->old_leb_cnt = c->leb_cnt;
+ if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
+ c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
+ if (c->vfs_sb->s_flags & MS_RDONLY)
+ dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
+ c->old_leb_cnt, c->leb_cnt);
+ else {
+ dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
+ c->old_leb_cnt, c->leb_cnt);
+ sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+ err = ubifs_write_sb_node(c, sup);
+ if (err)
+ goto out;
+ c->old_leb_cnt = c->leb_cnt;
+ }
+ }
+
+ c->log_bytes = (long long)c->log_lebs * c->leb_size;
+ c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
+ c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
+ c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
+ c->orph_first = c->lpt_last + 1;
+ c->orph_last = c->orph_first + c->orph_lebs - 1;
+ c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
+ c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
+ c->main_first = c->leb_cnt - c->main_lebs;
+ c->report_rp_size = ubifs_reported_space(c, c->rp_size);
+
+ err = validate_sb(c, sup);
+out:
+ kfree(sup);
+ return err;
+}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
new file mode 100644
index 00000000000..acf5c5fffc6
--- /dev/null
+++ b/fs/ubifs/scan.c
@@ -0,0 +1,362 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the scan which is a general-purpose function for
+ * determining what nodes are in an eraseblock. The scan is used to replay the
+ * journal, to do garbage collection, for the TNC in-the-gaps method, and by
+ * debugging functions.
+ */
+
+#include "ubifs.h"
+
+/**
+ * scan_padding_bytes - scan for padding bytes.
+ * @buf: buffer to scan
+ * @len: length of buffer
+ *
+ * This function counts how many bytes at the beginning of @buf are equal to
+ * %UBIFS_PADDING_BYTE, looking at no more than %UBIFS_PAD_NODE_SZ bytes and no
+ * more than @len bytes. It returns the number of padding bytes on success and
+ * %SCANNED_GARBAGE on failure, i.e. if there are no padding bytes at all or if
+ * their number is not a multiple of 8.
+ */
+static int scan_padding_bytes(void *buf, int len)
+{
+ int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
+ uint8_t *p = buf;
+
+ dbg_scan("not a node");
+
+ while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
+ pad_len += 1;
+
+ if (!pad_len || (pad_len & 7))
+ return SCANNED_GARBAGE;
+
+ dbg_scan("%d padding bytes", pad_len);
+
+ return pad_len;
+}
+
+/**
+ * ubifs_scan_a_node - scan for a node or padding.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to scan
+ * @len: length of buffer
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function returns a scanning code to indicate what was scanned:
+ * %SCANNED_EMPTY_SPACE if the magic of the common header is 0xFFFFFFFF; the
+ * result of 'scan_padding_bytes()' if the magic is not %UBIFS_NODE_MAGIC;
+ * %SCANNED_GARBAGE if @len is smaller than the common header;
+ * %SCANNED_A_CORRUPT_NODE if 'ubifs_check_node()' fails;
+ * %SCANNED_A_BAD_PAD_NODE if a padding node has a bad padding length; the
+ * number of bytes to skip (node length plus padding length) for a valid
+ * padding node; and %SCANNED_A_NODE for any other valid node.
+ *
+ * Note, the magic is read before @len is checked, so the caller has to provide
+ * at least enough bytes for that ('ubifs_scan()' stops when fewer than 8 bytes
+ * are left).
+ */
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int quiet)
+{
+ struct ubifs_ch *ch = buf;
+ uint32_t magic;
+
+ magic = le32_to_cpu(ch->magic);
+
+ if (magic == 0xFFFFFFFF) {
+ dbg_scan("hit empty space");
+ return SCANNED_EMPTY_SPACE;
+ }
+
+ if (magic != UBIFS_NODE_MAGIC)
+ return scan_padding_bytes(buf, len);
+
+ if (len < UBIFS_CH_SZ)
+ return SCANNED_GARBAGE;
+
+ dbg_scan("scanning %s", dbg_ntype(ch->node_type));
+
+ if (ubifs_check_node(c, buf, lnum, offs, quiet))
+ return SCANNED_A_CORRUPT_NODE;
+
+ if (ch->node_type == UBIFS_PAD_NODE) {
+ struct ubifs_pad_node *pad = buf;
+ int pad_len = le32_to_cpu(pad->pad_len);
+ int node_len = le32_to_cpu(ch->len);
+
+ /* Validate the padding node - it must not run past the LEB end */
+ if (pad_len < 0 ||
+ offs + node_len + pad_len > c->leb_size) {
+ if (!quiet) {
+ ubifs_err("bad pad node at LEB %d:%d",
+ lnum, offs);
+ dbg_dump_node(c, pad);
+ }
+ return SCANNED_A_BAD_PAD_NODE;
+ }
+
+ /* Make sure the node pads to an 8-byte boundary */
+ if ((node_len + pad_len) & 7) {
+ if (!quiet) {
+ dbg_err("bad padding length %d - %d",
+ offs, offs + node_len + pad_len);
+ }
+ return SCANNED_A_BAD_PAD_NODE;
+ }
+
+ dbg_scan("%d bytes padded, offset now %d",
+ pad_len, ALIGN(offs + node_len + pad_len, 8));
+
+ return node_len + pad_len;
+ }
+
+ return SCANNED_A_NODE;
+}
+
+/**
+ * ubifs_start_scan - create LEB scanning information at start of scan.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be c->leb_size)
+ *
+ * This function allocates the scanning information object and reads the LEB
+ * contents from offset @offs up to the end of the LEB into @sbuf at the same
+ * offset. A %-EBADMSG read error is tolerated and only recorded in the 'ecc'
+ * field of the scanning information.
+ *
+ * This function returns a pointer to the scanning information on success and
+ * an 'ERR_PTR()'-encoded negative error code on failure (%-ENOMEM or the read
+ * error).
+ */
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf)
+{
+ struct ubifs_scan_leb *sleb;
+ int err;
+
+ dbg_scan("scan LEB %d:%d", lnum, offs);
+
+ sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
+ if (!sleb)
+ return ERR_PTR(-ENOMEM);
+
+ sleb->lnum = lnum;
+ INIT_LIST_HEAD(&sleb->nodes);
+ sleb->buf = sbuf;
+
+ err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
+ if (err && err != -EBADMSG) {
+ ubifs_err("cannot read %d bytes from LEB %d:%d,"
+ " error %d", c->leb_size - offs, lnum, offs, err);
+ kfree(sleb);
+ return ERR_PTR(err);
+ }
+
+ if (err == -EBADMSG)
+ sleb->ecc = 1;
+
+ return sleb;
+}
+
+/**
+ * ubifs_end_scan - update LEB scanning information at end of scan.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @lnum: logical eraseblock number
+ * @offs: offset at which the scan stopped
+ *
+ * This function records the end point of the scan in @sleb->endpt, which is
+ * @offs aligned up to the min. I/O unit size (@offs is asserted to be aligned
+ * already). @lnum is used only for the debugging message. This function does
+ * not return a value.
+ */
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ int lnum, int offs)
+{
+ lnum = lnum;
+ dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
+ ubifs_assert(offs % c->min_io_size == 0);
+
+ sleb->endpt = ALIGN(offs, c->min_io_size);
+}
+
+/**
+ * ubifs_add_snod - add a scanned node to LEB scanning information.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @buf: buffer containing node
+ * @offs: offset of node on flash
+ *
+ * This function allocates a scanned node object, fills it with the sequence
+ * number, type and length taken from the common header of the node in @buf,
+ * and adds it to the tail of the @sleb->nodes list. The node itself is not
+ * copied - the object just points to @buf. For node types which have a key,
+ * the key is read as well.
+ *
+ * This function returns %0 on success and %-ENOMEM if the scanned node object
+ * cannot be allocated.
+ */
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ void *buf, int offs)
+{
+ struct ubifs_ch *ch = buf;
+ struct ubifs_ino_node *ino = buf;
+ struct ubifs_scan_node *snod;
+
+ snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+ if (!snod)
+ return -ENOMEM;
+
+ snod->sqnum = le64_to_cpu(ch->sqnum);
+ snod->type = ch->node_type;
+ snod->offs = offs;
+ snod->len = le32_to_cpu(ch->len);
+ snod->node = buf;
+
+ switch (ch->node_type) {
+ case UBIFS_INO_NODE:
+ case UBIFS_DENT_NODE:
+ case UBIFS_XENT_NODE:
+ case UBIFS_DATA_NODE:
+ case UBIFS_TRUN_NODE:
+ /*
+ * The key is in the same place in all keyed
+ * nodes, so it is read through the inode node
+ * layout whatever the actual node type is.
+ */
+ key_read(c, &ino->key, &snod->key);
+ break;
+ }
+ list_add_tail(&snod->list, &sleb->nodes);
+ sleb->nodes_cnt += 1;
+ return 0;
+}
+
+/**
+ * ubifs_scanned_corruption - print information after UBIFS scanned corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of corruption
+ * @offs: offset of corruption
+ * @buf: buffer containing corruption
+ *
+ * This function prints an error message and then, unless 'dbg_failure_mode'
+ * is set, a hex dump of the data in @buf. The dump covers the bytes from @offs
+ * to the end of the LEB, but not more than 4096 bytes.
+ */
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+ void *buf)
+{
+ int len;
+
+ ubifs_err("corrupted data at LEB %d:%d", lnum, offs);
+ if (dbg_failure_mode)
+ return;
+ len = c->leb_size - offs;
+ if (len > 4096)
+ len = 4096;
+ dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
+}
+
+/**
+ * ubifs_scan - scan a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be c->leb_size)
+ *
+ * This function scans LEB number @lnum and returns complete information about
+ * its contents. Nodes are added to the scanning information one by one,
+ * padding is skipped, and the scan stops at empty space or when fewer than 8
+ * bytes are left. The place where the scan stopped has to be aligned to the
+ * min. I/O unit size, and the rest of the LEB has to contain only 0xFF bytes.
+ *
+ * Returns a pointer to the scanning information on success. In case of
+ * failure the scanning information is destroyed and an 'ERR_PTR()'-encoded
+ * error code is returned - %-EUCLEAN if corruption was found, otherwise the
+ * error of the failing helper.
+ */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf)
+{
+ void *buf = sbuf + offs;
+ int err, len = c->leb_size - offs;
+ struct ubifs_scan_leb *sleb;
+
+ sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return sleb;
+
+ while (len >= 8) {
+ struct ubifs_ch *ch = buf;
+ int node_len, ret;
+
+ dbg_scan("look at LEB %d:%d (%d bytes left)",
+ lnum, offs, len);
+
+ cond_resched();
+
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+
+ if (ret > 0) {
+ /* Padding bytes or a valid padding node */
+ offs += ret;
+ buf += ret;
+ len -= ret;
+ continue;
+ }
+
+ if (ret == SCANNED_EMPTY_SPACE)
+ /* Empty space is checked later */
+ break;
+
+ switch (ret) {
+ case SCANNED_GARBAGE:
+ dbg_err("garbage");
+ goto corrupted;
+ case SCANNED_A_NODE:
+ break;
+ case SCANNED_A_CORRUPT_NODE:
+ case SCANNED_A_BAD_PAD_NODE:
+ dbg_err("bad node");
+ goto corrupted;
+ default:
+ dbg_err("unknown");
+ goto corrupted;
+ }
+
+ err = ubifs_add_snod(c, sleb, buf, offs);
+ if (err)
+ goto error;
+
+ node_len = ALIGN(le32_to_cpu(ch->len), 8);
+ offs += node_len;
+ buf += node_len;
+ len -= node_len;
+ }
+
+ if (offs % c->min_io_size)
+ goto corrupted;
+
+ ubifs_end_scan(c, sleb, lnum, offs);
+
+ for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
+ if (*(uint32_t *)buf != 0xffffffff)
+ break;
+ for (; len; offs++, buf++, len--)
+ if (*(uint8_t *)buf != 0xff) {
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
+ goto corrupted;
+ }
+
+ return sleb;
+
+corrupted:
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ err = -EUCLEAN;
+error:
+ ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+}
+
+/**
+ * ubifs_scan_destroy - destroy LEB scanning information.
+ * @sleb: scanning information to free
+ *
+ * This function frees every scanned node object on the @sleb->nodes list and
+ * then @sleb itself. The scan buffer which @sleb points to is not freed here.
+ */
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb)
+{
+ struct ubifs_scan_node *node;
+ struct list_head *head;
+
+ head = &sleb->nodes;
+ while (!list_empty(head)) {
+ node = list_entry(head->next, struct ubifs_scan_node, list);
+ list_del(&node->list);
+ kfree(node);
+ }
+ kfree(sleb);
+}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
new file mode 100644
index 00000000000..f248533841a
--- /dev/null
+++ b/fs/ubifs/shrinker.c
@@ -0,0 +1,322 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS shrinker which evicts clean znodes from the TNC
+ * tree when Linux VM needs more RAM.
+ *
+ * We do not implement any LRU lists to find oldest znodes to free because it
+ * would add additional overhead to the file system fast paths. So the shrinker
+ * just walks the TNC tree when searching for znodes to free.
+ *
+ * If the root of a TNC sub-tree is clean and old enough, then the children are
+ * also clean and old enough. So the shrinker walks the TNC in level order and
+ * dumps entire sub-trees.
+ *
+ * The age of znodes is just the time-stamp when they were last looked at.
+ * The current shrinker first tries to evict old znodes, then young ones.
+ *
+ * Since the shrinker is global, it has to protect against races with FS
+ * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
+ */
+
+#include "ubifs.h"
+
+/* List of all UBIFS file-system instances */
+LIST_HEAD(ubifs_infos);
+
+/*
+ * We number each shrinker run and record the number on the ubifs_info structure
+ * so that we can easily work out which ubifs_info structures have already been
+ * done by the current run.
+ */
+static unsigned int shrinker_run_no;
+
+/* Protects 'ubifs_infos' list */
+DEFINE_SPINLOCK(ubifs_infos_lock);
+
+/* Global clean znode counter (for all mounted UBIFS instances) */
+atomic_long_t ubifs_clean_zn_cnt;
+
+/**
+ * shrink_tnc - shrink TNC tree.
+ * @c: UBIFS file-system description object
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function traverses TNC tree and frees clean znodes. It does not free
+ * clean znodes which are younger than @age. Returns number of freed znodes.
+ *
+ * The caller has to hold both 'c->umount_mutex' and 'c->tnc_mutex' (this is
+ * asserted). Whole sub-trees are destroyed at once and @nr is only checked
+ * between sub-trees, so the returned number may be larger than @nr.
+ * @contention is only ever set by this function, never cleared.
+ */
+static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
+{
+ int total_freed = 0;
+ struct ubifs_znode *znode, *zprev;
+ int time = get_seconds();
+
+ ubifs_assert(mutex_is_locked(&c->umount_mutex));
+ ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+
+ if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
+ return 0;
+
+ /*
+ * Traverse the TNC tree in levelorder manner, so that it is possible
+ * to destroy large sub-trees. Indeed, if a znode is old, then all its
+ * children are older or of the same age.
+ *
+ * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
+ * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
+ * changed only when the 'c->tnc_mutex' is held.
+ */
+ zprev = NULL;
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+ while (znode && total_freed < nr &&
+ atomic_long_read(&c->clean_zn_cnt) > 0) {
+ int freed;
+
+ /*
+ * If the znode is clean, but it is in the 'c->cnext' list, this
+ * means that this znode has just been written to flash as a
+ * part of commit and was marked clean. They will be removed
+ * from the list at end commit. We cannot change the list,
+ * because it is not protected by any mutex (design decision to
+ * make commit really independent and parallel to main I/O). So
+ * we just skip these znodes.
+ *
+ * Note, the 'clean_zn_cnt' counters are not updated until
+ * after the commit, so the UBIFS shrinker does not report
+ * the znodes which are in the 'c->cnext' list as freeable.
+ *
+ * Also note, if the root of a sub-tree is not in 'c->cnext',
+ * then the whole sub-tree is not in 'c->cnext' as well, so it
+ * is safe to dump whole sub-tree.
+ */
+
+ if (znode->cnext) {
+ /*
+ * Very soon these znodes will be removed from the list
+ * and become freeable.
+ */
+ *contention = 1;
+ } else if (!ubifs_zn_dirty(znode) &&
+ abs(time - znode->time) >= age) {
+ if (znode->parent) /* unhook the sub-tree from its parent first */
+ znode->parent->zbranch[znode->iip].znode = NULL;
+ else
+ c->zroot.znode = NULL; /* the root itself is being freed */
+
+ freed = ubifs_destroy_tnc_subtree(znode);
+ atomic_long_sub(freed, &ubifs_clean_zn_cnt); /* global counter */
+ atomic_long_sub(freed, &c->clean_zn_cnt); /* per-FS counter */
+ ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
+ total_freed += freed;
+ znode = zprev; /* 'znode' is gone, continue from the one visited before it */
+ }
+
+ if (unlikely(!c->zroot.znode)) /* whole tree was freed, nothing to walk */
+ break;
+
+ zprev = znode;
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+ cond_resched();
+ }
+
+ return total_freed;
+}
+
+/**
+ * shrink_tnc_trees - shrink UBIFS TNC trees.
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function walks the list of mounted UBIFS file-systems and frees clean
+ * znodes which are older than @age, until at least @nr znodes are freed.
+ * Returns the number of freed znodes.
+ *
+ * The list is walked under 'ubifs_infos_lock', which is dropped only while
+ * 'shrink_tnc()' runs on one file-system. A file-system whose 'umount_mutex'
+ * or 'tnc_mutex' cannot be taken immediately is skipped and @contention is
+ * set. Every file-system which was shrunk is moved to the tail of the list.
+ */
+static int shrink_tnc_trees(int nr, int age, int *contention)
+{
+ struct ubifs_info *c;
+ struct list_head *p;
+ unsigned int run_no;
+ int freed = 0;
+
+ spin_lock(&ubifs_infos_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0); /* never use 0 as a run number - presumably it means "not visited yet", TODO confirm */
+ /* Iterate over all mounted UBIFS file-systems and try to shrink them */
+ p = ubifs_infos.next;
+ while (p != &ubifs_infos) {
+ c = list_entry(p, struct ubifs_info, infos_list);
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (c->shrinker_run_no == run_no)
+ break;
+ if (!mutex_trylock(&c->umount_mutex)) {
+ /* Some un-mount is in progress, try next FS */
+ *contention = 1;
+ p = p->next;
+ continue;
+ }
+ /*
+ * We're holding 'c->umount_mutex', so the file-system won't go
+ * away.
+ */
+ if (!mutex_trylock(&c->tnc_mutex)) {
+ mutex_unlock(&c->umount_mutex);
+ *contention = 1;
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&ubifs_infos_lock);
+ /*
+ * OK, now we have TNC locked, the file-system cannot go away -
+ * it is safe to reap the cache.
+ */
+ c->shrinker_run_no = run_no;
+ freed += shrink_tnc(c, nr, age, contention); /* note: the full @nr is passed, not what is still missing */
+ mutex_unlock(&c->tnc_mutex);
+ spin_lock(&ubifs_infos_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_del(&c->infos_list);
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ mutex_unlock(&c->umount_mutex);
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&ubifs_infos_lock);
+ return freed;
+}
+
+/**
+ * kick_a_thread - kick a background thread to start commit.
+ *
+ * This function kicks a background thread to start background commit. Returns
+ * %-1 if a thread was kicked or there is another reason to assume the memory
+ * will soon be freed or become freeable. If there are no dirty znodes, returns
+ * %0.
+ *
+ * File-systems which have no dirty znodes, whose commit state is
+ * %COMMIT_BROKEN or which are on read-only media are skipped. %-1 is also
+ * returned as soon as some 'c->umount_mutex' cannot be taken, or some
+ * file-system is found whose commit state is not %COMMIT_RESTING.
+ */
+static int kick_a_thread(void)
+{
+ int i;
+ struct ubifs_info *c;
+
+ /*
+ * Iterate over all mounted UBIFS file-systems and find out if there is
+ * already an ongoing commit operation there. If no, then iterate for
+ * the second time and initiate background commit.
+ */
+ spin_lock(&ubifs_infos_lock);
+ for (i = 0; i < 2; i++) { /* i == 0: only look, i == 1: request a commit */
+ list_for_each_entry(c, &ubifs_infos, infos_list) {
+ long dirty_zn_cnt;
+
+ if (!mutex_trylock(&c->umount_mutex)) {
+ /*
+ * Some un-mount is in progress, it will
+ * certainly free memory, so just return.
+ */
+ spin_unlock(&ubifs_infos_lock);
+ return -1;
+ }
+
+ dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
+
+ if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
+ c->ro_media) { /* a commit here would not help */
+ mutex_unlock(&c->umount_mutex);
+ continue;
+ }
+
+ if (c->cmt_state != COMMIT_RESTING) { /* commit state is not resting, do not request another one */
+ spin_unlock(&ubifs_infos_lock);
+ mutex_unlock(&c->umount_mutex);
+ return -1;
+ }
+
+ if (i == 1) {
+ list_del(&c->infos_list); /* move to the tail of the list */
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ spin_unlock(&ubifs_infos_lock);
+
+ ubifs_request_bg_commit(c);
+ mutex_unlock(&c->umount_mutex);
+ return -1;
+ }
+ mutex_unlock(&c->umount_mutex);
+ }
+ }
+ spin_unlock(&ubifs_infos_lock);
+
+ return 0;
+}
+
+/**
+ * ubifs_shrinker - UBIFS TNC shrinker call-back.
+ * @nr: number of znodes the VM asks to free, %0 to only query the cache size
+ * @gfp_mask: allocation flags of the caller (not used by this function)
+ *
+ * When @nr is %0, returns the number of clean znodes in all mounted UBIFS
+ * file-systems (never a negative number). Otherwise tries to free @nr clean
+ * znodes: old ones first, then young ones, then any. Returns the number of
+ * freed znodes, or %-1 if nothing could be freed right now but memory is
+ * expected to become freeable soon (lock contention, or a background commit
+ * was kicked). Returns %0 if there is nothing to free at all.
+ */
+int ubifs_shrinker(int nr, gfp_t gfp_mask)
+{
+	int freed, contention = 0;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
+
+	if (nr == 0)
+		/*
+		 * The global counter is modified by every mounted instance
+		 * without a common lock, so a transiently negative reading
+		 * cannot be ruled out. A negative answer to a "how many
+		 * objects" query would be misinterpreted by the VM, so report
+		 * a small positive number instead and let it ask again.
+		 */
+		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+
+	if (!clean_zn_cnt) {
+		/*
+		 * No clean znodes, nothing to reap. All we can do in this case
+		 * is to kick background threads to start commit, which will
+		 * probably make clean znodes which, in turn, will be freeable.
+		 * And we return -1 which will make VM call us again later.
+		 */
+		dbg_tnc("no clean znodes, kick a thread");
+		return kick_a_thread();
+	}
+
+	freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
+	if (freed >= nr)
+		goto out;
+
+	dbg_tnc("not enough old znodes, try to free young ones");
+	freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
+	if (freed >= nr)
+		goto out;
+
+	dbg_tnc("not enough young znodes, free all");
+	freed += shrink_tnc_trees(nr - freed, 0, &contention);
+
+	if (!freed && contention) {
+		dbg_tnc("freed nothing, but contention");
+		return -1;
+	}
+
+out:
+	dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+	return freed;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
new file mode 100644
index 00000000000..00eb9c68ad0
--- /dev/null
+++ b/fs/ubifs/super.c
@@ -0,0 +1,1951 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS initialization and VFS superblock operations. Some
+ * initialization stuff which is rather large and complex is placed at
+ * corresponding subsystems, but most of it is here.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include "ubifs.h"
+
+/* Slab cache for UBIFS inodes */
+struct kmem_cache *ubifs_inode_slab;
+
+/* UBIFS TNC shrinker description */
+static struct shrinker ubifs_shrinker_info = {
+ .shrink = ubifs_shrinker,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/**
+ * validate_inode - validate inode.
+ * @c: UBIFS file-system description object
+ * @inode: the inode to validate
+ *
+ * This is a helper function for 'ubifs_iget()' which validates various fields
+ * of a newly built inode to make sure they contain sane values and prevent
+ * possible vulnerabilities. Returns zero if the inode is all right and
+ * a non-zero error code if not.
+ *
+ * The codes %1..%5 returned by the checks below are not errno values, they
+ * only identify which check failed. If all of them pass, the result of
+ * 'dbg_check_dir_size()' is returned. A compressor which is not compiled in
+ * only produces a warning and does not fail the validation.
+ */
+static int validate_inode(struct ubifs_info *c, const struct inode *inode)
+{
+ int err;
+ const struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (inode->i_size > c->max_inode_sz) {
+ ubifs_err("inode is too large (%lld)",
+ (long long)inode->i_size);
+ return 1;
+ }
+
+ if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+ ubifs_err("unknown compression type %d", ui->compr_type);
+ return 2;
+ }
+
+ if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) /* extended attribute name accounting exceeds the limit */
+ return 3;
+
+ if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) /* in-inode data length out of range */
+ return 4;
+
+ if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) /* xattr inodes have to be regular files */
+ return 5;
+
+ if (!ubifs_compr_present(ui->compr_type)) {
+ ubifs_warn("inode %lu uses '%s' compression, but it was not "
+ "compiled in", inode->i_ino,
+ ubifs_compr_name(ui->compr_type));
+ }
+
+ err = dbg_check_dir_size(c, inode);
+ return err;
+}
+
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) /* get the VFS inode number @inum, reading it via the TNC if it is new; returns the inode or an ERR_PTR() */
+{
+ int err;
+ union ubifs_key key;
+ struct ubifs_ino_node *ino;
+ struct ubifs_info *c = sb->s_fs_info;
+ struct inode *inode;
+ struct ubifs_inode *ui;
+
+ dbg_gen("inode %lu", inum);
+
+ inode = iget_locked(sb, inum);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW)) /* not a new inode - return it as is */
+ return inode;
+ ui = ubifs_inode(inode);
+
+ ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); /* temporary buffer for the inode node, freed on every exit path */
+ if (!ino) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ino_key_init(c, &key, inode->i_ino);
+
+ err = ubifs_tnc_lookup(c, &key, ino);
+ if (err)
+ goto out_ino;
+
+ inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_nlink = le32_to_cpu(ino->nlink); /* node fields are little-endian, convert each one */
+ inode->i_uid = le32_to_cpu(ino->uid);
+ inode->i_gid = le32_to_cpu(ino->gid);
+ inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
+ inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
+ inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+ inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
+ inode->i_mode = le32_to_cpu(ino->mode);
+ inode->i_size = le64_to_cpu(ino->size);
+
+ ui->data_len = le32_to_cpu(ino->data_len);
+ ui->flags = le32_to_cpu(ino->flags);
+ ui->compr_type = le16_to_cpu(ino->compr_type);
+ ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum);
+ ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+ ui->xattr_size = le32_to_cpu(ino->xattr_size);
+ ui->xattr_names = le32_to_cpu(ino->xattr_names);
+ ui->synced_i_size = ui->ui_size = inode->i_size;
+
+ ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0;
+
+ err = validate_inode(c, inode);
+ if (err)
+ goto out_invalid;
+
+ /* Disable readahead */
+ inode->i_mapping->backing_dev_info = &c->bdi;
+
+ switch (inode->i_mode & S_IFMT) { /* per-type operations and in-inode data checks */
+ case S_IFREG:
+ inode->i_mapping->a_ops = &ubifs_file_address_operations;
+ inode->i_op = &ubifs_file_inode_operations;
+ inode->i_fop = &ubifs_file_operations;
+ if (ui->xattr) {
+ ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); /* +1 for the terminating zero byte */
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ ((char *)ui->data)[ui->data_len] = '\0';
+ } else if (ui->data_len != 0) {
+ err = 10; /* codes 10..15 only identify the failed check in the log, 'out_invalid' returns -EINVAL */
+ goto out_invalid;
+ }
+ break;
+ case S_IFDIR:
+ inode->i_op = &ubifs_dir_inode_operations;
+ inode->i_fop = &ubifs_dir_operations;
+ if (ui->data_len != 0) {
+ err = 11;
+ goto out_invalid;
+ }
+ break;
+ case S_IFLNK:
+ inode->i_op = &ubifs_symlink_inode_operations;
+ if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) {
+ err = 12;
+ goto out_invalid;
+ }
+ ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); /* +1 for the terminating zero byte */
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ ((char *)ui->data)[ui->data_len] = '\0';
+ break;
+ case S_IFBLK:
+ case S_IFCHR:
+ {
+ dev_t rdev;
+ union ubifs_dev_desc *dev;
+
+ ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+
+ dev = (union ubifs_dev_desc *)ino->data;
+ if (ui->data_len == sizeof(dev->new)) /* the data length tells which encoding is stored */
+ rdev = new_decode_dev(le32_to_cpu(dev->new));
+ else if (ui->data_len == sizeof(dev->huge))
+ rdev = huge_decode_dev(le64_to_cpu(dev->huge));
+ else {
+ err = 13;
+ goto out_invalid;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ inode->i_op = &ubifs_file_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+ case S_IFSOCK:
+ case S_IFIFO:
+ inode->i_op = &ubifs_file_inode_operations;
+ init_special_inode(inode, inode->i_mode, 0);
+ if (ui->data_len != 0) {
+ err = 14;
+ goto out_invalid;
+ }
+ break;
+ default:
+ err = 15;
+ goto out_invalid;
+ }
+
+ kfree(ino);
+ ubifs_set_inode_flags(inode);
+ unlock_new_inode(inode);
+ return inode;
+
+out_invalid:
+ ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
+ dbg_dump_node(c, ino);
+ dbg_dump_inode(c, inode);
+ err = -EINVAL; /* replace the internal check number by a real error code */
+out_ino:
+ kfree(ino);
+out:
+ ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err);
+ iget_failed(inode); /* NOTE(review): 'ui->data' is not freed here - presumably 'ubifs_destroy_inode()' does it, confirm */
+ return ERR_PTR(err);
+}
+
+/**
+ * ubifs_alloc_inode - allocate a UBIFS inode object.
+ * @sb: VFS super block (not used by this function)
+ *
+ * Allocates an object from 'ubifs_inode_slab', zeroes the UBIFS-specific part
+ * of it and initializes its mutex and spin-lock. Returns the embedded VFS
+ * inode, or %NULL if the allocation failed.
+ */
+static struct inode *ubifs_alloc_inode(struct super_block *sb)
+{
+	struct ubifs_inode *ui;
+
+	ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+	if (!ui)
+		return NULL;
+
+	/*
+	 * Only the bytes after the first 'sizeof(struct inode)' bytes are
+	 * cleared. NOTE(review): this assumes 'vfs_inode' is the first member
+	 * of 'struct ubifs_inode' - confirm against ubifs.h.
+	 */
+	memset((void *)ui + sizeof(struct inode), 0,
+	       sizeof(struct ubifs_inode) - sizeof(struct inode));
+	mutex_init(&ui->ui_mutex);
+	spin_lock_init(&ui->ui_lock);
+	return &ui->vfs_inode;
+}
+
+/**
+ * ubifs_destroy_inode - free a UBIFS inode object.
+ * @inode: VFS inode embedded into the UBIFS inode to free
+ *
+ * Frees the in-inode data buffer ('ui->data', may be %NULL) and gives the
+ * object back to 'ubifs_inode_slab'.
+ */
+static void ubifs_destroy_inode(struct inode *inode)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	kfree(ui->data);
+	/*
+	 * Free the pointer which was allocated from the slab, i.e. the
+	 * containing 'struct ubifs_inode', rather than relying on the VFS
+	 * inode having the same address.
+	 */
+	kmem_cache_free(ubifs_inode_slab, ui);
+}
+
+/*
+ * Note, Linux write-back code calls this without 'i_mutex'.
+ *
+ * Writes @inode to the journal if it is still marked dirty. The dirty flag is
+ * checked and cleared under 'ui->ui_mutex'. Bad inodes and inodes which are
+ * not dirty are silently skipped and zero is returned. Note, the dirty flag
+ * is cleared and the dirty inode budget is released even if the journal
+ * write fails; the error code of the write is returned in that case. @wait
+ * is not used.
+ */
+static int ubifs_write_inode(struct inode *inode, int wait)
+{
+ int err;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ ubifs_assert(!ui->xattr);
+ if (is_bad_inode(inode))
+ return 0;
+
+ mutex_lock(&ui->ui_mutex);
+ /*
+ * Due to races between write-back forced by budgeting
+ * (see 'sync_some_inodes()') and pdflush write-back, the inode may
+ * have already been synchronized, do not do this again. This might
+ * also happen if it was synchronized in an VFS operation, e.g.
+ * 'ubifs_link()'.
+ */
+ if (!ui->dirty) {
+ mutex_unlock(&ui->ui_mutex);
+ return 0;
+ }
+
+ dbg_gen("inode %lu", inode->i_ino);
+ err = ubifs_jnl_write_inode(c, inode, 0);
+ if (err)
+ ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
+
+ ui->dirty = 0; /* cleared regardless of 'err' */
+ mutex_unlock(&ui->ui_mutex);
+ ubifs_release_dirty_inode_budget(c, ui);
+ return err;
+}
+
+/*
+ * VFS 'delete_inode' call-back. For an ordinary inode the page cache is
+ * dropped and, unless the inode is bad, a zero-size inode node is written to
+ * the journal in deletion mode. In every case the inode is finally cleared.
+ */
+static void ubifs_delete_inode(struct inode *inode)
+{
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	int err;
+
+	if (ubifs_inode(inode)->xattr) {
+		/*
+		 * Extended attribute inode deletions are fully handled in
+		 * 'ubifs_removexattr()'. These inodes are special and have
+		 * limited usage, so there is nothing to do here.
+		 */
+		clear_inode(inode);
+		return;
+	}
+
+	dbg_gen("inode %lu", inode->i_ino);
+	ubifs_assert(!atomic_read(&inode->i_count));
+	ubifs_assert(inode->i_nlink == 0);
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (!is_bad_inode(inode)) {
+		ubifs_inode(inode)->ui_size = inode->i_size = 0;
+		err = ubifs_jnl_write_inode(c, inode, 1);
+		if (err)
+			/*
+			 * Worst case we have a lost orphan inode wasting
+			 * space, so a simple error message is ok here.
+			 */
+			ubifs_err("can't write inode %lu, error %d",
+				  inode->i_ino, err);
+	}
+
+	clear_inode(inode);
+}
+
+/*
+ * VFS 'dirty_inode' call-back. Marks the UBIFS inode dirty if it is not dirty
+ * yet. The caller has to hold 'ui->ui_mutex' (this is asserted).
+ */
+static void ubifs_dirty_inode(struct inode *inode)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+
+	/* Already dirty - nothing to change and nothing to log */
+	if (ui->dirty)
+		return;
+
+	ui->dirty = 1;
+	dbg_gen("inode %lu", inode->i_ino);
+}
+
+/**
+ * ubifs_statfs - report file-system statistics.
+ * @dentry: any dentry of the file-system in question
+ * @buf: where to store the statistics
+ *
+ * Free space comes from 'ubifs_budg_get_free_space()' and is reported in
+ * units of %UBIFS_BLOCK_SIZE. The number of blocks available to users is the
+ * free space minus 'c->report_rp_size', or zero if the free space does not
+ * exceed it. File counts are reported as zero. Always returns zero.
+ */
+static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ubifs_info *c = dentry->d_sb->s_fs_info;
+	unsigned long long free;
+
+	free = ubifs_budg_get_free_space(c);
+	/* 'free' is unsigned, so it has to be printed with %llu */
+	dbg_gen("free space %llu bytes (%llu blocks)",
+		free, free >> UBIFS_BLOCK_SHIFT);
+
+	buf->f_type = UBIFS_SUPER_MAGIC;
+	buf->f_bsize = UBIFS_BLOCK_SIZE;
+	buf->f_blocks = c->block_cnt;
+	buf->f_bfree = free >> UBIFS_BLOCK_SHIFT;
+	if (free > c->report_rp_size)
+		buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT;
+	else
+		buf->f_bavail = 0;
+	buf->f_files = 0;
+	buf->f_ffree = 0;
+	buf->f_namelen = UBIFS_MAX_NLEN;
+
+	return 0;
+}
+
+/*
+ * VFS 'show_options' call-back. Prints the un-mount mode option if one was
+ * given explicitly ('unmount_mode' is %2 for "fast_unmount" and %1 for
+ * "norm_unmount"); prints nothing otherwise. Always returns zero.
+ */
+static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+	struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
+
+	switch (c->mount_opts.unmount_mode) {
+	case 2:
+		seq_printf(s, ",fast_unmount");
+		break;
+	case 1:
+		seq_printf(s, ",norm_unmount");
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * VFS 'sync_fs' call-back. Synchronizes the write-buffers of all journal
+ * heads. All heads are synchronized even if some of them fail; the first
+ * error code seen is returned, or zero if there was none. @wait is not used.
+ */
+static int ubifs_sync_fs(struct super_block *sb, int wait)
+{
+	struct ubifs_info *c = sb->s_fs_info;
+	int first_err = 0, hd;
+
+	/*
+	 * We ought to call sync for c->ubi but it does not have one. If it had
+	 * it would in turn call mtd->sync, however mtd operations are
+	 * synchronous anyway, so we don't lose any sleep here.
+	 */
+	if (!c->jheads)
+		return 0;
+
+	for (hd = 0; hd < c->jhead_cnt; hd++) {
+		int err = ubifs_wbuf_sync(&c->jheads[hd].wbuf);
+
+		/* Remember only the first failure, but keep going */
+		if (first_err == 0 && err != 0)
+			first_err = err;
+	}
+
+	return first_err;
+}
+
+/**
+ * init_constants_early - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes UBIFS constants which do not need the superblock
+ * to be read. It also checks that the UBI volume satisfies basic UBIFS
+ * requirements. Returns zero in case of success and a negative error code in
+ * case of failure.
+ *
+ * The constants are derived from the UBI volume and device information
+ * ('c->vi' and 'c->di'). A corrupted volume, a read-only UBI device and a
+ * static volume do not fail this function - they only set 'c->ro_media'.
+ */
+static int init_constants_early(struct ubifs_info *c)
+{
+ if (c->vi.corrupted) {
+ ubifs_warn("UBI volume is corrupted - read-only mode");
+ c->ro_media = 1;
+ }
+
+ if (c->di.ro_mode) {
+ ubifs_msg("read-only UBI device");
+ c->ro_media = 1;
+ }
+
+ if (c->vi.vol_type == UBI_STATIC_VOLUME) {
+ ubifs_msg("static UBI volume - read-only mode");
+ c->ro_media = 1;
+ }
+
+ c->leb_cnt = c->vi.size;
+ c->leb_size = c->vi.usable_leb_size;
+ c->half_leb_size = c->leb_size / 2;
+ c->min_io_size = c->di.min_io_size;
+ c->min_io_shift = fls(c->min_io_size) - 1; /* log2 of the size, which is checked to be a power of 2 below */
+
+ if (c->leb_size < UBIFS_MIN_LEB_SZ) {
+ ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
+ c->leb_size, UBIFS_MIN_LEB_SZ);
+ return -EINVAL;
+ }
+
+ if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+ ubifs_err("too few LEBs (%d), min. is %d",
+ c->leb_cnt, UBIFS_MIN_LEB_CNT);
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(c->min_io_size)) {
+ ubifs_err("bad min. I/O size %d", c->min_io_size);
+ return -EINVAL;
+ }
+
+ /*
+ * UBIFS aligns all nodes to 8-byte boundary, so to make function in
+ * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
+ * less than 8.
+ */
+ if (c->min_io_size < 8) {
+ c->min_io_size = 8;
+ c->min_io_shift = 3;
+ }
+
+ c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
+ c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
+
+ /*
+ * Initialize node length ranges which are mostly needed for node
+ * length validation.
+ */
+ c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; /* these node types get a single 'len' value */
+ c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
+ c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
+ c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
+ c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
+ c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
+
+ c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; /* the rest get a 'min_len'/'max_len' range */
+ c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
+ c->ranges[UBIFS_ORPH_NODE].min_len =
+ UBIFS_ORPH_NODE_SZ + sizeof(__le64);
+ c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
+ c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
+ c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
+ c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
+ c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
+ c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
+ c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
+ /*
+ * Minimum indexing node size is amended later when superblock is
+ * read and the key length is known.
+ */
+ c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
+ /*
+ * Maximum indexing node size is amended later when superblock is
+ * read and the fanout is known.
+ */
+ c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
+
+ /*
+ * Initialize dead and dark LEB space watermarks.
+ *
+ * Dead space is the space which cannot be used. Its watermark is
+ * equivalent to min. I/O unit or minimum node size if it is greater
+ * than min. I/O unit.
+ *
+ * Dark space is the space which might be used, or might not, depending
+ * on which node should be written to the LEB. Its watermark is
+ * equivalent to maximum UBIFS node size.
+ */
+ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
+ c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+
+ return 0;
+}
+
+/**
+ * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB the write-buffer was synchronized to
+ * @free: how many free bytes left in this LEB
+ * @pad: how many bytes were padded
+ *
+ * This is a callback function which is called by the I/O unit when the
+ * write-buffer is synchronized. We need this to correctly maintain space
+ * accounting in bud logical eraseblocks. This function returns zero in case of
+ * success and a negative error code in case of failure.
+ *
+ * This function actually belongs to the journal, but we keep it here because
+ * we want to keep it static.
+ *
+ * The work is delegated to 'ubifs_update_one_lp()', which is given @lnum,
+ * @free and @pad and zeroes for its two remaining arguments; its return value
+ * is passed through unchanged.
+ */
+static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
+{
+ return ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
+}
+
+/**
+ * init_constants_late - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the superblock has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right. Returns zero in case of success and a
+ * negative error code in case of failure.
+ *
+ * %-EINVAL is returned if the LEB size cannot hold a commit start node plus
+ * one reference node per journal head, or if the log has too few LEBs. An
+ * error of 'ubifs_calc_lpt_geom()' is passed through.
+ */
+static int init_constants_late(struct ubifs_info *c)
+{
+ int tmp, err;
+ uint64_t tmp64;
+
+ c->main_bytes = (long long)c->main_lebs * c->leb_size;
+ c->max_znode_sz = sizeof(struct ubifs_znode) +
+ c->fanout * sizeof(struct ubifs_zbranch);
+
+ tmp = ubifs_idx_node_sz(c, 1); /* index node with a single branch */
+ c->ranges[UBIFS_IDX_NODE].min_len = tmp;
+ c->min_idx_node_sz = ALIGN(tmp, 8);
+
+ tmp = ubifs_idx_node_sz(c, c->fanout); /* index node with 'c->fanout' branches */
+ c->ranges[UBIFS_IDX_NODE].max_len = tmp;
+ c->max_idx_node_sz = ALIGN(tmp, 8);
+
+ /* Make sure LEB size is large enough to fit full commit */
+ tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
+ tmp = ALIGN(tmp, c->min_io_size);
+ if (tmp > c->leb_size) {
+ dbg_err("too small LEB size %d, at least %d needed",
+ c->leb_size, tmp);
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure that the log is large enough to fit reference nodes for
+ * all buds plus one reserved LEB.
+ */
+ tmp64 = c->max_bud_bytes;
+ tmp = do_div(tmp64, c->leb_size); /* 'tmp64' becomes the quotient, 'tmp' the remainder */
+ c->max_bud_cnt = tmp64 + !!tmp; /* i.e. the division rounded up */
+ tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
+ tmp /= c->leb_size;
+ tmp += 1;
+ if (c->log_lebs < tmp) {
+ dbg_err("too small log %d LEBs, required min. %d LEBs",
+ c->log_lebs, tmp);
+ return -EINVAL;
+ }
+
+ /*
+ * When budgeting we assume worst-case scenarios when the pages are not
+ * compressed and direntries are of the maximum size.
+ *
+ * Note, data, which may be stored in inodes is budgeted separately, so
+ * it is not included into 'c->inode_budget'.
+ */
+ c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+ c->inode_budget = UBIFS_INO_NODE_SZ;
+ c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+
+ /*
+ * When the amount of flash space used by buds becomes
+ * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
+ * The writers are unblocked when the commit is finished. To avoid
+ * writers to be blocked UBIFS initiates background commit in advance,
+ * when number of bud bytes becomes above the limit defined below.
+ */
+ c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; /* 13/16 of the maximum */
+
+ /*
+ * Ensure minimum journal size. All the bytes in the journal heads are
+ * considered to be used, when calculating the current journal usage.
+ * Consequently, if the journal is too small, UBIFS will treat it as
+ * always full.
+ */
+ tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
+ if (c->bg_bud_bytes < tmp64)
+ c->bg_bud_bytes = tmp64;
+ if (c->max_bud_bytes < tmp64 + c->leb_size)
+ c->max_bud_bytes = tmp64 + c->leb_size;
+
+ err = ubifs_calc_lpt_geom(c);
+ if (err)
+ return err;
+
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ /*
+ * Calculate total amount of FS blocks. This number is not used
+ * internally because it does not make much sense for UBIFS, but it is
+ * necessary to report something for the 'statfs()' call.
+ *
+ * Subtract the LEB reserved for GC and the LEB which is reserved for
+ * deletions.
+ *
+ * Review 'ubifs_calc_available()' if changing this calculation.
+ */
+ tmp64 = c->main_lebs - 2;
+ tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
+ tmp64 = ubifs_reported_space(c, tmp64);
+ c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
+
+ return 0;
+}
+
+/**
+ * take_gc_lnum - reserve GC LEB.
+ * @c: UBIFS file-system description object
+ *
+ * The LEB reserved for garbage collection ('c->gc_lnum') is un-mapped and then
+ * recorded in lprops as "taken", with the free space set to the LEB size and
+ * the dirty space set to zero. The lprops have to be rewritten because they
+ * may contain out-of-date information if the file-system was un-mounted before
+ * it has been committed. Returns zero in case of success and a negative error
+ * code in case of failure (%-EINVAL if there is no GC LEB at all).
+ */
+static int take_gc_lnum(struct ubifs_info *c)
+{
+	int ret;
+
+	if (c->gc_lnum == -1) {
+		ubifs_err("no LEB for GC");
+		return -EINVAL;
+	}
+
+	ret = ubifs_leb_unmap(c, c->gc_lnum);
+	if (ret)
+		return ret;
+
+	/* And we have to tell lprops that this LEB is taken */
+	return ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
+				   LPROPS_TAKEN, 0, 0);
+}
+
+/**
+ * alloc_wbufs - allocate write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This helper function allocates the array of journal heads and initializes
+ * the write-buffer of each of them. Returns zero in case of success, %-ENOMEM
+ * if the array cannot be allocated, and the error code of 'ubifs_wbuf_init()'
+ * if a write-buffer cannot be initialized.
+ *
+ * Nothing is freed here on failure: 'c->jheads' stays allocated (and zeroed
+ * for the heads which were not reached). NOTE(review): presumably the caller
+ * cleans up with 'free_wbufs()' - confirm.
+ */
+static int alloc_wbufs(struct ubifs_info *c)
+{
+	int i, err;
+
+	/* kcalloc() checks the 'count * size' multiplication for overflow */
+	c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+			    GFP_KERNEL);
+	if (!c->jheads)
+		return -ENOMEM;
+
+	/* Initialize journal heads */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		INIT_LIST_HEAD(&c->jheads[i].buds_list);
+		err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
+		if (err)
+			return err;
+
+		c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
+		c->jheads[i].wbuf.jhead = i;
+	}
+
+	c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
+	/*
+	 * Garbage Collector head likely contains long-term data and
+	 * does not need to be synchronized by timer.
+	 */
+	c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
+	c->jheads[GCHD].wbuf.timeout = 0;
+
+	return 0;
+}
+
+/**
+ * free_wbufs - free write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * Frees the buffer and the inodes array of every journal head write-buffer,
+ * then the journal heads array itself, and resets 'c->jheads' to %NULL. Does
+ * nothing if the array was never allocated.
+ */
+static void free_wbufs(struct ubifs_info *c)
+{
+	int hd;
+
+	if (!c->jheads)
+		return;
+
+	for (hd = 0; hd < c->jhead_cnt; hd++) {
+		kfree(c->jheads[hd].wbuf.buf);
+		kfree(c->jheads[hd].wbuf.inodes);
+	}
+
+	kfree(c->jheads);
+	c->jheads = NULL;
+}
+
+/**
+ * free_orphans - free orphans.
+ * @c: UBIFS file-system description object
+ *
+ * First frees every orphan chained through 'c->orph_dnext' (each of them is
+ * also unlinked from the list it is on), then frees whatever is still left on
+ * 'c->orph_list', printing an error message for each such orphan, and finally
+ * frees 'c->orph_buf'.
+ */
+static void free_orphans(struct ubifs_info *c)
+{
+ struct ubifs_orphan *orph;
+
+ while (c->orph_dnext) {
+ orph = c->orph_dnext;
+ c->orph_dnext = orph->dnext; /* advance before 'orph' is freed */
+ list_del(&orph->list);
+ kfree(orph);
+ }
+
+ while (!list_empty(&c->orph_list)) {
+ orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
+ list_del(&orph->list);
+ kfree(orph);
+ dbg_err("orphan list not empty at unmount"); /* printed once per left-over orphan */
+ }
+
+ vfree(c->orph_buf);
+ c->orph_buf = NULL;
+}
+
+/**
+ * free_buds - free per-bud objects.
+ * @c: UBIFS file-system description object
+ *
+ * The 'c->buds' RB-tree is destroyed without recursion: the walk descends
+ * until it finds a leaf, detaches the leaf from its parent, frees it and
+ * continues from the parent. Note, 'c->buds.rb_node' itself is not reset by
+ * this function.
+ */
+static void free_buds(struct ubifs_info *c)
+{
+ struct rb_node *this = c->buds.rb_node;
+ struct ubifs_bud *bud;
+
+ while (this) {
+ if (this->rb_left)
+ this = this->rb_left;
+ else if (this->rb_right)
+ this = this->rb_right;
+ else { /* a leaf - it can be freed */
+ bud = rb_entry(this, struct ubifs_bud, rb);
+ this = rb_parent(this); /* step up before the leaf goes away */
+ if (this) {
+ if (this->rb_left == &bud->rb) /* clear the link so the leaf is not visited again */
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(bud);
+ }
+ }
+}
+
+/**
+ * check_volume_empty - check if the UBI volume is empty.
+ * @c: UBIFS file-system description object
+ *
+ * The volume is considered empty if none of its LEBs is mapped. The LEBs are
+ * probed one by one and the probing stops at the first mapped LEB. The result
+ * is stored in @c->empty. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+static int check_volume_empty(struct ubifs_info *c)
+{
+	int lnum = 0;
+
+	c->empty = 1;
+	while (lnum < c->leb_cnt) {
+		int mapped = ubi_is_mapped(c->ubi, lnum);
+
+		if (unlikely(mapped < 0))
+			return mapped;
+
+		if (mapped == 1) {
+			/* One mapped LEB is enough to know the answer */
+			c->empty = 0;
+			return 0;
+		}
+
+		cond_resched();
+		lnum++;
+	}
+
+	return 0;
+}
+
+/*
+ * UBIFS mount options.
+ *
+ * Opt_fast_unmount: do not run a journal commit before un-mounting
+ * Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_err: just end of array marker
+ */
+enum {
+ Opt_fast_unmount,
+ Opt_norm_unmount,
+ Opt_err,
+};
+
+/*
+ * Table which maps mount option strings to the tokens above. It is passed to
+ * 'match_token()' by 'ubifs_parse_options()'. The last entry has a NULL
+ * pattern.
+ */
+static match_table_t tokens = {
+ {Opt_fast_unmount, "fast_unmount"},
+ {Opt_norm_unmount, "norm_unmount"},
+ {Opt_err, NULL},
+};
+
+/**
+ * ubifs_parse_options - parse mount parameters.
+ * @c: UBIFS file-system description object
+ * @options: comma-separated parameters to parse (may be %NULL)
+ * @is_remount: non-zero if this is FS re-mount (currently not used)
+ *
+ * This function parses UBIFS mount options and records them in @c. Note,
+ * @options is modified while it is being split. Returns zero in case of
+ * success and %-EINVAL if an option is not recognized.
+ */
+static int ubifs_parse_options(struct ubifs_info *c, char *options,
+			       int is_remount)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *opt;
+
+	if (!options)
+		return 0;
+
+	for (opt = strsep(&options, ","); opt; opt = strsep(&options, ",")) {
+		int token;
+
+		/* Tolerate empty fields, e.g. the one in "a,,b" */
+		if (!*opt)
+			continue;
+
+		token = match_token(opt, tokens, args);
+		if (token == Opt_fast_unmount) {
+			c->mount_opts.unmount_mode = 2;
+			c->fast_unmount = 1;
+		} else if (token == Opt_norm_unmount) {
+			c->mount_opts.unmount_mode = 1;
+			c->fast_unmount = 0;
+		} else {
+			ubifs_err("unrecognized mount option \"%s\" "
+				  "or missing value", opt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * destroy_journal - destroy journal data structures.
+ * @c: UBIFS file-system description object
+ *
+ * This function empties the unclean LEB list and the old buds list, and then
+ * destroys the index GC list, the size tree, the TNC and the buds tree. Some
+ * of these objects may have been created by recovery functions.
+ */
+static void destroy_journal(struct ubifs_info *c)
+{
+	struct ubifs_unclean_leb *ucleb;
+	struct ubifs_bud *bud;
+
+	while (!list_empty(&c->unclean_leb_list)) {
+		ucleb = list_entry(c->unclean_leb_list.next,
+				   struct ubifs_unclean_leb, list);
+		list_del(&ucleb->list);
+		kfree(ucleb);
+	}
+
+	while (!list_empty(&c->old_buds)) {
+		bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		list_del(&bud->list);
+		kfree(bud);
+	}
+
+	ubifs_destroy_idx_gc(c);
+	ubifs_destroy_size_tree(c);
+	ubifs_tnc_close(c);
+	free_buds(c);
+}
+
+/**
+ * mount_ubifs - mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function mounts UBIFS file system. Returns zero in case of success and
+ * a negative error code in case of failure.
+ *
+ * Note, if the function fails half way through, it releases the resources it
+ * has acquired so far by itself (see the error labels at the end of the
+ * function). The freed pointers in @c are not reset to %NULL.
+ */
+static int mount_ubifs(struct ubifs_info *c)
+{
+ struct super_block *sb = c->vfs_sb;
+ int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
+ long long x;
+ size_t sz;
+
+ err = init_constants_early(c);
+ if (err)
+ return err;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ c->dbg_buf = vmalloc(c->leb_size);
+ if (!c->dbg_buf)
+ return -ENOMEM;
+#endif
+
+ err = check_volume_empty(c);
+ if (err)
+ goto out_free;
+
+ if (c->empty && (mounted_read_only || c->ro_media)) {
+ /*
+ * This UBI volume is empty, and read-only, or the file system
+ * is mounted read-only - we cannot format it.
+ */
+ ubifs_err("can't format empty UBI volume: read-only %s",
+ c->ro_media ? "UBI volume" : "mount");
+ err = -EROFS;
+ goto out_free;
+ }
+
+ if (c->ro_media && !mounted_read_only) {
+ ubifs_err("cannot mount read-write - read-only media");
+ err = -EROFS;
+ goto out_free;
+ }
+
+ /*
+ * The requirement for the buffer is that it should fit indexing B-tree
+ * height amount of integers. We assume the height of the TNC tree will
+ * never exceed 64.
+ */
+ err = -ENOMEM;
+ c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
+ if (!c->bottom_up_buf)
+ goto out_free;
+
+ c->sbuf = vmalloc(c->leb_size);
+ if (!c->sbuf)
+ goto out_free;
+
+ /* This buffer is not allocated for read-only mounts */
+ if (!mounted_read_only) {
+ c->ileb_buf = vmalloc(c->leb_size);
+ if (!c->ileb_buf)
+ goto out_free;
+ }
+
+ err = ubifs_read_superblock(c);
+ if (err)
+ goto out_free;
+
+ /*
+ * Make sure the compressor which is set as the default one in the
+ * superblock was actually compiled in.
+ */
+ if (!ubifs_compr_present(c->default_compr)) {
+ ubifs_warn("'%s' compressor is set by superblock, but not "
+ "compiled in", ubifs_compr_name(c->default_compr));
+ c->default_compr = UBIFS_COMPR_NONE;
+ }
+
+ dbg_failure_mode_registration(c);
+
+ err = init_constants_late(c);
+ if (err)
+ goto out_dereg;
+
+ sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
+ sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
+ c->cbuf = kmalloc(sz, GFP_NOFS);
+ if (!c->cbuf) {
+ err = -ENOMEM;
+ goto out_dereg;
+ }
+
+ /* Write-buffers and the background thread are needed only for R/W */
+ if (!mounted_read_only) {
+ err = alloc_wbufs(c);
+ if (err)
+ goto out_cbuf;
+
+ /* Create background thread */
+ sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
+ c->vi.vol_id);
+ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+ if (!c->bgt)
+ c->bgt = ERR_PTR(-EINVAL);
+ if (IS_ERR(c->bgt)) {
+ err = PTR_ERR(c->bgt);
+ c->bgt = NULL;
+ ubifs_err("cannot spawn \"%s\", error %d",
+ c->bgt_name, err);
+ goto out_wbufs;
+ }
+ wake_up_process(c->bgt);
+ }
+
+ err = ubifs_read_master(c);
+ if (err)
+ goto out_master;
+
+ if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
+ ubifs_msg("recovery needed");
+ c->need_recovery = 1;
+ if (!mounted_read_only) {
+ err = ubifs_recover_inl_heads(c, c->sbuf);
+ if (err)
+ goto out_master;
+ }
+ } else if (!mounted_read_only) {
+ /*
+ * Set the "dirty" flag so that if we reboot uncleanly we
+ * will notice this immediately on the next mount.
+ */
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+ err = ubifs_write_master(c);
+ if (err)
+ goto out_master;
+ }
+
+ err = ubifs_lpt_init(c, 1, !mounted_read_only);
+ if (err)
+ goto out_lpt;
+
+ err = dbg_check_idx_size(c, c->old_idx_sz);
+ if (err)
+ goto out_lpt;
+
+ err = ubifs_replay_journal(c);
+ if (err)
+ goto out_journal;
+
+ err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
+ if (err)
+ goto out_orphans;
+
+ if (!mounted_read_only) {
+ int lnum;
+
+ /* Check for enough free space */
+ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+ ubifs_err("insufficient available space");
+ err = -EINVAL;
+ goto out_orphans;
+ }
+
+ /* Check for enough log space */
+ lnum = c->lhead_lnum + 1;
+ if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+ lnum = UBIFS_LOG_LNUM;
+ if (lnum == c->ltail_lnum) {
+ err = ubifs_consolidate_log(c);
+ if (err)
+ goto out_orphans;
+ }
+
+ if (c->need_recovery) {
+ err = ubifs_recover_size(c);
+ if (err)
+ goto out_orphans;
+ err = ubifs_rcvry_gc_commit(c);
+ } else
+ err = take_gc_lnum(c);
+ if (err)
+ goto out_orphans;
+
+ err = dbg_check_lprops(c);
+ if (err)
+ goto out_orphans;
+ } else if (c->need_recovery) {
+ err = ubifs_recover_size(c);
+ if (err)
+ goto out_orphans;
+ }
+
+ /* Add this file-system to the global list of UBIFS instances */
+ spin_lock(&ubifs_infos_lock);
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ spin_unlock(&ubifs_infos_lock);
+
+ if (c->need_recovery) {
+ if (mounted_read_only)
+ ubifs_msg("recovery deferred");
+ else {
+ c->need_recovery = 0;
+ ubifs_msg("recovery completed");
+ }
+ }
+
+ err = dbg_check_filesystem(c);
+ if (err)
+ goto out_infos;
+
+ ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num,
+ c->vi.vol_id);
+ if (mounted_read_only)
+ ubifs_msg("mounted read-only");
+ x = (long long)c->main_lebs * c->leb_size;
+ ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
+ x, x >> 10, x >> 20, c->main_lebs);
+ x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
+ ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
+ x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
+ ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
+ ubifs_msg("media format %d, latest format %d",
+ c->fmt_version, UBIFS_FORMAT_VERSION);
+
+ dbg_msg("compiled on: " __DATE__ " at " __TIME__);
+ dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
+ dbg_msg("LEB size: %d bytes (%d KiB)",
+ c->leb_size, c->leb_size / 1024);
+ dbg_msg("data journal heads: %d",
+ c->jhead_cnt - NONDATA_JHEADS_CNT);
+ dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X"
+ "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
+ c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
+ c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
+ c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
+ c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
+ dbg_msg("fast unmount: %d", c->fast_unmount);
+ dbg_msg("big_lpt %d", c->big_lpt);
+ dbg_msg("log LEBs: %d (%d - %d)",
+ c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
+ dbg_msg("LPT area LEBs: %d (%d - %d)",
+ c->lpt_lebs, c->lpt_first, c->lpt_last);
+ dbg_msg("orphan area LEBs: %d (%d - %d)",
+ c->orph_lebs, c->orph_first, c->orph_last);
+ dbg_msg("main area LEBs: %d (%d - %d)",
+ c->main_lebs, c->main_first, c->leb_cnt - 1);
+ dbg_msg("index LEBs: %d", c->lst.idx_lebs);
+ dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
+ c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+ dbg_msg("key hash type: %d", c->key_hash_type);
+ dbg_msg("tree fanout: %d", c->fanout);
+ dbg_msg("reserved GC LEB: %d", c->gc_lnum);
+ dbg_msg("first main LEB: %d", c->main_first);
+ dbg_msg("dead watermark: %d", c->dead_wm);
+ dbg_msg("dark watermark: %d", c->dark_wm);
+ x = (long long)c->main_lebs * c->dark_wm;
+ dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
+ x, x >> 10, x >> 20);
+ dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
+ c->max_bud_bytes, c->max_bud_bytes >> 10,
+ c->max_bud_bytes >> 20);
+ dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
+ c->bg_bud_bytes, c->bg_bud_bytes >> 10,
+ c->bg_bud_bytes >> 20);
+ dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)",
+ c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
+ dbg_msg("max. seq. number: %llu", c->max_sqnum);
+ dbg_msg("commit number: %llu", c->cmt_no);
+
+ return 0;
+
+ /*
+ * Error unwinding: each label releases what was set up before the
+ * corresponding failure point and falls through to the labels below it.
+ */
+out_infos:
+ spin_lock(&ubifs_infos_lock);
+ list_del(&c->infos_list);
+ spin_unlock(&ubifs_infos_lock);
+out_orphans:
+ free_orphans(c);
+out_journal:
+ destroy_journal(c);
+out_lpt:
+ ubifs_lpt_free(c, 0);
+out_master:
+ kfree(c->mst_node);
+ kfree(c->rcvrd_mst_node);
+ if (c->bgt)
+ kthread_stop(c->bgt);
+out_wbufs:
+ free_wbufs(c);
+out_cbuf:
+ kfree(c->cbuf);
+out_dereg:
+ dbg_failure_mode_deregistration(c);
+out_free:
+ vfree(c->ileb_buf);
+ vfree(c->sbuf);
+ kfree(c->bottom_up_buf);
+ UBIFS_DBG(vfree(c->dbg_buf));
+ return err;
+}
+
+/**
+ * ubifs_umount - un-mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * Note, this function is called to free allocated resources when un-mounting,
+ * as well as free resources when an error occurred while we were half way
+ * through mounting (error path cleanup function). So it has to make sure the
+ * resource was actually allocated before freeing it.
+ */
+static void ubifs_umount(struct ubifs_info *c)
+{
+ dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
+ c->vi.vol_id);
+
+ /* Remove this file-system from the global list of UBIFS instances */
+ spin_lock(&ubifs_infos_lock);
+ list_del(&c->infos_list);
+ spin_unlock(&ubifs_infos_lock);
+
+ /* Stop the background thread before freeing anything */
+ if (c->bgt)
+ kthread_stop(c->bgt);
+
+ destroy_journal(c);
+ free_wbufs(c);
+ free_orphans(c);
+ ubifs_lpt_free(c, 0);
+
+ /* Release the buffers which were allocated in 'mount_ubifs()' */
+ kfree(c->cbuf);
+ kfree(c->rcvrd_mst_node);
+ kfree(c->mst_node);
+ vfree(c->sbuf);
+ kfree(c->bottom_up_buf);
+ UBIFS_DBG(vfree(c->dbg_buf));
+ vfree(c->ileb_buf);
+ dbg_failure_mode_deregistration(c);
+}
+
+/**
+ * ubifs_remount_rw - re-mount in read-write mode.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS avoids allocating many unnecessary resources when mounted in read-only
+ * mode. This function allocates the needed resources and re-mounts UBIFS in
+ * read-write mode. If recovery was deferred at mount time, it is completed
+ * here.
+ *
+ * The whole operation runs with @c->umount_mutex held and @c->remounting_rw
+ * set. Returns zero in case of success and a negative error code in case of
+ * failure. In case of failure everything this function has allocated is
+ * released, and the mutex is unlocked.
+ */
+static int ubifs_remount_rw(struct ubifs_info *c)
+{
+	int err, lnum;
+
+	if (c->ro_media)
+		return -EINVAL;
+
+	mutex_lock(&c->umount_mutex);
+	c->remounting_rw = 1;
+
+	/* Check for enough free space */
+	if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+		ubifs_err("insufficient available space");
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* The LEB count has changed - store the new one in the superblock */
+	if (c->old_leb_cnt != c->leb_cnt) {
+		struct ubifs_sb_node *sup;
+
+		sup = ubifs_read_sb_node(c);
+		if (IS_ERR(sup)) {
+			err = PTR_ERR(sup);
+			goto out;
+		}
+		sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+		err = ubifs_write_sb_node(c, sup);
+		if (err)
+			goto out;
+	}
+
+	if (c->need_recovery) {
+		ubifs_msg("completing deferred recovery");
+		err = ubifs_write_rcvrd_mst_node(c);
+		if (err)
+			goto out;
+		err = ubifs_recover_size(c);
+		if (err)
+			goto out;
+		err = ubifs_clean_lebs(c, c->sbuf);
+		if (err)
+			goto out;
+		err = ubifs_recover_inl_heads(c, c->sbuf);
+		if (err)
+			goto out;
+	}
+
+	/* Mark the master node dirty, just like a read-write mount does */
+	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+		err = ubifs_write_master(c);
+		if (err)
+			goto out;
+	}
+
+	c->ileb_buf = vmalloc(c->leb_size);
+	if (!c->ileb_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = ubifs_lpt_init(c, 0, 1);
+	if (err)
+		goto out;
+
+	err = alloc_wbufs(c);
+	if (err)
+		goto out;
+
+	ubifs_create_buds_lists(c);
+
+	/* Create background thread */
+	c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+	if (!c->bgt)
+		c->bgt = ERR_PTR(-EINVAL);
+	if (IS_ERR(c->bgt)) {
+		err = PTR_ERR(c->bgt);
+		c->bgt = NULL;
+		ubifs_err("cannot spawn \"%s\", error %d",
+			  c->bgt_name, err);
+		/*
+		 * Do not return directly - the mutex is locked and the
+		 * resources allocated above have to be released.
+		 */
+		goto out;
+	}
+	wake_up_process(c->bgt);
+
+	c->orph_buf = vmalloc(c->leb_size);
+	if (!c->orph_buf) {
+		/* Same here - the common error path has to be taken */
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Check for enough log space */
+	lnum = c->lhead_lnum + 1;
+	if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+		lnum = UBIFS_LOG_LNUM;
+	if (lnum == c->ltail_lnum) {
+		err = ubifs_consolidate_log(c);
+		if (err)
+			goto out;
+	}
+
+	if (c->need_recovery)
+		err = ubifs_rcvry_gc_commit(c);
+	else
+		err = take_gc_lnum(c);
+	if (err)
+		goto out;
+
+	if (c->need_recovery) {
+		c->need_recovery = 0;
+		ubifs_msg("deferred recovery completed");
+	}
+
+	dbg_gen("re-mounted read-write");
+	c->vfs_sb->s_flags &= ~MS_RDONLY;
+	c->remounting_rw = 0;
+	mutex_unlock(&c->umount_mutex);
+	return 0;
+
+out:
+	/* Every step below tolerates a resource which was not allocated */
+	vfree(c->orph_buf);
+	c->orph_buf = NULL;
+	if (c->bgt) {
+		kthread_stop(c->bgt);
+		c->bgt = NULL;
+	}
+	free_wbufs(c);
+	vfree(c->ileb_buf);
+	c->ileb_buf = NULL;
+	ubifs_lpt_free(c, 1);
+	c->remounting_rw = 0;
+	mutex_unlock(&c->umount_mutex);
+	return err;
+}
+
+/**
+ * commit_on_unmount - commit the journal when un-mounting.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs a commit unless the "fast unmount" mode is enabled. The
+ * commit is also skipped when the journal holds too little data, i.e., when
+ * the amount of bud bytes does not exceed one LEB size.
+ */
+static void commit_on_unmount(struct ubifs_info *c)
+{
+	long long pending;
+
+	if (c->fast_unmount)
+		return;
+
+	/* Sample the counter under the lock which protects it */
+	spin_lock(&c->buds_lock);
+	pending = c->bud_bytes;
+	spin_unlock(&c->buds_lock);
+
+	if (pending > c->leb_size)
+		ubifs_run_commit(c);
+}
+
+/**
+ * ubifs_remount_ro - re-mount in read-only mode.
+ * @c: UBIFS file-system description object
+ *
+ * We rely on VFS to have stopped writing. Possibly the background thread could
+ * be running a commit, however kthread_stop will wait in that case.
+ *
+ * The function commits the journal if needed, stops the background thread,
+ * synchronizes the write-buffers, writes a clean master node and releases the
+ * resources which are only used in read-write mode.
+ */
+static void ubifs_remount_ro(struct ubifs_info *c)
+{
+	int hd, err;
+
+	ubifs_assert(!c->need_recovery);
+	commit_on_unmount(c);
+
+	mutex_lock(&c->umount_mutex);
+
+	if (c->bgt) {
+		kthread_stop(c->bgt);
+		c->bgt = NULL;
+	}
+
+	/* Flush every journal head and kill its timer */
+	for (hd = 0; hd < c->jhead_cnt; hd++) {
+		ubifs_wbuf_sync(&c->jheads[hd].wbuf);
+		del_timer_sync(&c->jheads[hd].wbuf.timer);
+	}
+
+	if (!c->ro_media) {
+		/* Clear "dirty", set "no orphans", save the GC LEB number */
+		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+		c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+		err = ubifs_write_master(c);
+		if (err)
+			ubifs_ro_mode(c, err);
+	}
+
+	/* Drop what is needed in read-write mode only */
+	ubifs_destroy_idx_gc(c);
+	free_wbufs(c);
+	vfree(c->orph_buf);
+	c->orph_buf = NULL;
+	vfree(c->ileb_buf);
+	c->ileb_buf = NULL;
+	ubifs_lpt_free(c, 1);
+
+	mutex_unlock(&c->umount_mutex);
+}
+
+/**
+ * ubifs_put_super - VFS 'put_super()' operation.
+ * @sb: VFS super-block being released
+ *
+ * If the file-system is not mounted read-only, this function stops the
+ * background thread, synchronizes the write-buffers and tries to write a
+ * clean master node. Then it frees all UBIFS resources, closes the UBI volume
+ * and frees the UBIFS file-system description object itself.
+ */
+static void ubifs_put_super(struct super_block *sb)
+{
+ int i;
+ struct ubifs_info *c = sb->s_fs_info;
+
+ ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
+ c->vi.vol_id);
+ /*
+ * The following asserts are only valid if there has not been a failure
+ * of the media. For example, there will be dirty inodes if we failed
+ * to write them back because of I/O errors.
+ */
+ ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
+ ubifs_assert(c->budg_idx_growth == 0);
+ ubifs_assert(c->budg_data_growth == 0);
+
+ /*
+ * The 'c->umount_mutex' prevents races between UBIFS memory shrinker
+ * and file system un-mount. Namely, it prevents the shrinker from
+ * picking this superblock for shrinking - it will be just skipped if
+ * the mutex is locked.
+ */
+ mutex_lock(&c->umount_mutex);
+ if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
+ /*
+ * First of all kill the background thread to make sure it does
+ * not interfere with un-mounting and freeing resources.
+ */
+ if (c->bgt) {
+ kthread_stop(c->bgt);
+ c->bgt = NULL;
+ }
+
+ /* Synchronize write-buffers */
+ if (c->jheads)
+ for (i = 0; i < c->jhead_cnt; i++) {
+ ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ del_timer_sync(&c->jheads[i].wbuf.timer);
+ }
+
+ /*
+ * On fatal errors c->ro_media is set to 1, in which case we do
+ * not write the master node.
+ */
+ if (!c->ro_media) {
+ /*
+ * We are being cleanly unmounted which means the
+ * orphans were killed - indicate this in the master
+ * node. Also save the reserved GC LEB number.
+ */
+ int err;
+
+ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+ err = ubifs_write_master(c);
+ if (err)
+ /*
+ * Recovery will attempt to fix the master area
+ * next mount, so we just print a message and
+ * continue to unmount normally.
+ */
+ ubifs_err("failed to write master node, "
+ "error %d", err);
+ }
+ }
+
+ ubifs_umount(c);
+ bdi_destroy(&c->bdi);
+ ubi_close_volume(c->ubi);
+ mutex_unlock(&c->umount_mutex);
+ /* The mutex lives inside 'c', so it is unlocked before 'c' is freed */
+ kfree(c);
+}
+
+/**
+ * ubifs_remount_fs - VFS 'remount_fs()' operation.
+ * @sb: VFS super-block
+ * @flags: requested mount flags
+ * @data: mount options string
+ *
+ * This function parses the new mount options and then switches between
+ * read-only and read-write modes if the %MS_RDONLY flag has changed. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct ubifs_info *c = sb->s_fs_info;
+	int err, was_ro, want_ro;
+
+	dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+
+	err = ubifs_parse_options(c, data, 1);
+	if (err) {
+		ubifs_err("invalid or unknown remount parameter");
+		return err;
+	}
+
+	was_ro = !!(sb->s_flags & MS_RDONLY);
+	want_ro = !!(*flags & MS_RDONLY);
+
+	if (was_ro && !want_ro)
+		return ubifs_remount_rw(c);
+
+	if (!was_ro && want_ro)
+		ubifs_remount_ro(c);
+
+	return 0;
+}
+
+/*
+ * UBIFS super-block operations. This table is installed into 'sb->s_op' by
+ * 'ubifs_fill_super()'.
+ */
+struct super_operations ubifs_super_operations = {
+ .alloc_inode = ubifs_alloc_inode,
+ .destroy_inode = ubifs_destroy_inode,
+ .put_super = ubifs_put_super,
+ .write_inode = ubifs_write_inode,
+ .delete_inode = ubifs_delete_inode,
+ .statfs = ubifs_statfs,
+ .dirty_inode = ubifs_dirty_inode,
+ .remount_fs = ubifs_remount_fs,
+ .show_options = ubifs_show_options,
+ .sync_fs = ubifs_sync_fs,
+};
+
+/**
+ * open_ubi - parse UBI device name string and open the UBI device.
+ * @name: UBI volume name
+ * @mode: UBI volume open mode
+ *
+ * The following forms of @name are understood:
+ * o ubiX_Y - UBI device number X, volume Y;
+ * o ubiY - UBI device number 0, volume Y;
+ * o ubiX:NAME - UBI device X, volume with name NAME;
+ * o ubi:NAME - UBI device 0, volume with name NAME.
+ *
+ * The '!' character is accepted wherever ':' is (some shells like busybox may
+ * interpret ':' as an NFS host name separator). Returns the UBI volume
+ * descriptor in case of success and an error pointer in case of failure
+ * (%-EINVAL if @name cannot be parsed).
+ */
+static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+{
+	int first, second;
+	char *rest;
+
+	if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
+		return ERR_PTR(-EINVAL);
+
+	/* ubi:NAME method */
+	if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
+		return ubi_open_volume_nm(0, name + 4, mode);
+
+	if (!isdigit(name[3]))
+		return ERR_PTR(-EINVAL);
+
+	first = simple_strtoul(name + 3, &rest, 0);
+
+	/* What follows the first number tells which method is used */
+	switch (*rest) {
+	case '\0':
+		/* ubiY method - the number is the volume ID */
+		return ubi_open_volume(0, first, mode);
+	case '_':
+		/* ubiX_Y method */
+		if (!isdigit(rest[1]))
+			break;
+		second = simple_strtoul(rest + 1, &rest, 0);
+		if (*rest != '\0')
+			return ERR_PTR(-EINVAL);
+		return ubi_open_volume(first, second, mode);
+	case ':':
+	case '!':
+		/* ubiX:NAME method */
+		if (rest[1] == '\0')
+			break;
+		return ubi_open_volume_nm(first, rest + 1, mode);
+	default:
+		break;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+/**
+ * ubifs_fill_super - allocate and initialize the UBIFS description object.
+ * @sb: VFS super-block; on entry 'sb->s_fs_info' holds the UBI volume
+ *      descriptor opened by the caller
+ * @data: mount options string
+ * @silent: not used
+ *
+ * This function allocates the UBIFS file-system description object, re-opens
+ * the UBI volume in read-write mode, mounts the file-system and sets up the
+ * root dentry. Returns zero in case of success and a negative error code in
+ * case of failure. In case of failure everything allocated here is released.
+ */
+static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct ubi_volume_desc *ubi = sb->s_fs_info;
+	struct ubifs_info *c;
+	struct inode *root;
+	int err;
+
+	c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+
+	spin_lock_init(&c->cnt_lock);
+	spin_lock_init(&c->cs_lock);
+	spin_lock_init(&c->buds_lock);
+	spin_lock_init(&c->space_lock);
+	spin_lock_init(&c->orphan_lock);
+	init_rwsem(&c->commit_sem);
+	mutex_init(&c->lp_mutex);
+	mutex_init(&c->tnc_mutex);
+	mutex_init(&c->log_mutex);
+	mutex_init(&c->mst_mutex);
+	mutex_init(&c->umount_mutex);
+	init_waitqueue_head(&c->cmt_wq);
+	c->buds = RB_ROOT;
+	c->old_idx = RB_ROOT;
+	c->size_tree = RB_ROOT;
+	c->orph_tree = RB_ROOT;
+	INIT_LIST_HEAD(&c->infos_list);
+	INIT_LIST_HEAD(&c->idx_gc);
+	INIT_LIST_HEAD(&c->replay_list);
+	INIT_LIST_HEAD(&c->replay_buds);
+	INIT_LIST_HEAD(&c->uncat_list);
+	INIT_LIST_HEAD(&c->empty_list);
+	INIT_LIST_HEAD(&c->freeable_list);
+	INIT_LIST_HEAD(&c->frdi_idx_list);
+	INIT_LIST_HEAD(&c->unclean_leb_list);
+	INIT_LIST_HEAD(&c->old_buds);
+	INIT_LIST_HEAD(&c->orph_list);
+	INIT_LIST_HEAD(&c->orph_new);
+
+	c->highest_inum = UBIFS_FIRST_INO;
+	get_random_bytes(&c->vfs_gen, sizeof(int));
+	c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
+
+	ubi_get_volume_info(ubi, &c->vi);
+	ubi_get_device_info(c->vi.ubi_num, &c->di);
+
+	/* Re-open the UBI device in read-write mode */
+	c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
+	if (IS_ERR(c->ubi)) {
+		err = PTR_ERR(c->ubi);
+		goto out_free;
+	}
+
+	/*
+	 * UBIFS provides 'backing_dev_info' in order to disable readahead. For
+	 * UBIFS, I/O is not deferred, it is done immediately in readpage,
+	 * which means the user would have to wait not just for their own I/O
+	 * but the readahead I/O as well i.e. completely pointless.
+	 *
+	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+	 */
+	c->bdi.capabilities = BDI_CAP_MAP_COPY;
+	c->bdi.unplug_io_fn = default_unplug_io_fn;
+	err = bdi_init(&c->bdi);
+	if (err)
+		goto out_close;
+
+	err = ubifs_parse_options(c, data, 0);
+	if (err)
+		goto out_bdi;
+
+	c->vfs_sb = sb;
+
+	/* From now on 'sb->s_fs_info' refers to 'c', not to 'ubi' */
+	sb->s_fs_info = c;
+	sb->s_magic = UBIFS_SUPER_MAGIC;
+	sb->s_blocksize = UBIFS_BLOCK_SIZE;
+	sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
+	sb->s_dev = c->vi.cdev;
+	sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
+	if (c->max_inode_sz > MAX_LFS_FILESIZE)
+		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
+	sb->s_op = &ubifs_super_operations;
+
+	mutex_lock(&c->umount_mutex);
+	err = mount_ubifs(c);
+	if (err) {
+		ubifs_assert(err < 0);
+		goto out_unlock;
+	}
+
+	/* Read the root inode */
+	root = ubifs_iget(sb, UBIFS_ROOT_INO);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out_umount;
+	}
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		/*
+		 * 'err' is zero at this point, so it has to be set here,
+		 * otherwise success is returned after everything was freed.
+		 */
+		err = -ENOMEM;
+		goto out_iput;
+	}
+
+	mutex_unlock(&c->umount_mutex);
+
+	return 0;
+
+out_iput:
+	iput(root);
+out_umount:
+	ubifs_umount(c);
+out_unlock:
+	mutex_unlock(&c->umount_mutex);
+out_bdi:
+	bdi_destroy(&c->bdi);
+out_close:
+	ubi_close_volume(c->ubi);
+out_free:
+	kfree(c);
+	return err;
+}
+
+/*
+ * 'sget()' comparison callback: returns non-zero if super-block @sb has the
+ * device number which @data points to.
+ */
+static int sb_test(struct super_block *sb, void *data)
+{
+	return sb->s_dev == *(dev_t *)data;
+}
+
+/*
+ * 'sget()' set-up callback: stores the device number which @data points to
+ * in super-block @sb. Always returns zero.
+ */
+static int sb_set(struct super_block *sb, void *data)
+{
+	sb->s_dev = *(dev_t *)data;
+	return 0;
+}
+
+/**
+ * ubifs_get_sb - 'get_sb()' operation of the UBIFS file-system type.
+ * @fs_type: file-system type description object
+ * @flags: mount flags
+ * @name: UBI volume specification (see 'open_ubi()')
+ * @data: mount options string
+ * @mnt: VFS mount object to attach the super-block to
+ *
+ * This function finds or creates the super-block for the UBI volume and fills
+ * it if it is new. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
+			const char *name, void *data, struct vfsmount *mnt)
+{
+	struct ubi_volume_desc *ubi;
+	struct ubi_volume_info vi;
+	struct super_block *sb;
+	int err;
+
+	dbg_gen("name %s, flags %#x", name, flags);
+
+	/*
+	 * Get UBI device number and volume ID. Open the volume read-only so
+	 * far because this might be a new mount point, and UBI allows only one
+	 * read-write user at a time.
+	 */
+	ubi = open_ubi(name, UBI_READONLY);
+	if (IS_ERR(ubi)) {
+		err = PTR_ERR(ubi);
+		ubifs_err("cannot open \"%s\", error %d", name, err);
+		return err;
+	}
+	ubi_get_volume_info(ubi, &vi);
+
+	dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
+
+	sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
+	if (IS_ERR(sb)) {
+		err = PTR_ERR(sb);
+		goto out_close;
+	}
+
+	if (!sb->s_root) {
+		/* Fresh super-block - the file-system has to be mounted */
+		sb->s_flags = flags;
+		/*
+		 * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
+		 * replaced by 'c'.
+		 */
+		sb->s_fs_info = ubi;
+		err = ubifs_fill_super(sb, data, !!(flags & MS_SILENT));
+		if (err)
+			goto out_deact;
+		/* We do not support atime */
+		sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+	} else {
+		/* A new mount point for already mounted UBIFS */
+		dbg_gen("this ubi volume is already mounted");
+		if ((flags ^ sb->s_flags) & MS_RDONLY) {
+			err = -EBUSY;
+			goto out_deact;
+		}
+	}
+
+	/* 'fill_super()' opens ubi again so we must close it here */
+	ubi_close_volume(ubi);
+
+	return simple_set_mnt(mnt, sb);
+
+out_deact:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+out_close:
+	ubi_close_volume(ubi);
+	return err;
+}
+
+/**
+ * ubifs_kill_sb - 'kill_sb()' operation of the UBIFS file-system type.
+ * @sb: VFS super-block to shut down
+ */
+static void ubifs_kill_sb(struct super_block *sb)
+{
+	int mounted_rw = sb->s_root && !(sb->s_flags & MS_RDONLY);
+
+	/*
+	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
+	 * in order to be outside BKL.
+	 */
+	if (mounted_rw)
+		commit_on_unmount(sb->s_fs_info);
+
+	/* The un-mount routine is actually done in put_super() */
+	generic_shutdown_super(sb);
+}
+
+/*
+ * UBIFS file-system type description object. It is passed to
+ * 'register_filesystem()' in 'ubifs_init()'.
+ */
+static struct file_system_type ubifs_fs_type = {
+ .name = "ubifs",
+ .owner = THIS_MODULE,
+ .get_sb = ubifs_get_sb,
+ .kill_sb = ubifs_kill_sb
+};
+
+/*
+ * Constructor of the UBIFS inode slab cache: performs the one-time
+ * initialization of the VFS inode embedded into the UBIFS inode object.
+ */
+static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
+{
+	inode_init_once(&((struct ubifs_inode *)obj)->vfs_inode);
+}
+
+/**
+ * ubifs_init - UBIFS initialization function.
+ *
+ * This function checks the on-flash node size assumptions at compile time,
+ * creates the inode slab cache, registers the memory shrinker, initializes
+ * the compressors and finally registers the file-system. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+static int __init ubifs_init(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
+
+	/* Make sure node sizes are 8-byte aligned */
+	BUILD_BUG_ON(UBIFS_CH_SZ & 7);
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);
+
+	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7);
+	BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7);
+	BUILD_BUG_ON(MIN_WRITE_SZ & 7);
+
+	/* Check min. node size */
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
+	BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);
+
+	BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
+	BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ);
+
+	/* Defined node sizes */
+	BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096);
+	BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
+	BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
+	BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
+
+	/*
+	 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
+	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
+	 */
+	if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+		ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
+			  " at least 4096 bytes",
+			  (unsigned int)PAGE_CACHE_SIZE);
+		return -EINVAL;
+	}
+
+	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
+				sizeof(struct ubifs_inode), 0,
+				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
+				&inode_slab_ctor);
+	if (!ubifs_inode_slab)
+		return -ENOMEM;
+
+	register_shrinker(&ubifs_shrinker_info);
+
+	err = ubifs_compressors_init();
+	if (err)
+		goto out_shrinker;
+
+	/*
+	 * Register the file-system last: once it is registered it may be
+	 * mounted, and mounting needs the inode slab cache and the
+	 * compressors to be ready.
+	 */
+	err = register_filesystem(&ubifs_fs_type);
+	if (err) {
+		ubifs_err("cannot register file system, error %d", err);
+		goto out_compr;
+	}
+
+	return 0;
+
+out_compr:
+	ubifs_compressors_exit();
+out_shrinker:
+	unregister_shrinker(&ubifs_shrinker_info);
+	kmem_cache_destroy(ubifs_inode_slab);
+	return err;
+}
+/* late_initcall to let compressors initialize first */
+late_initcall(ubifs_init);
+
+/*
+ * UBIFS module exit function: shuts down the compressors, unregisters the
+ * shrinker, destroys the inode slab cache and unregisters the file-system.
+ */
+static void __exit ubifs_exit(void)
+{
+ /* No UBIFS instance is expected to be left at this point */
+ ubifs_assert(list_empty(&ubifs_infos));
+ ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+
+ ubifs_compressors_exit();
+ unregister_shrinker(&ubifs_shrinker_info);
+ kmem_cache_destroy(ubifs_inode_slab);
+ unregister_filesystem(&ubifs_fs_type);
+}
+module_exit(ubifs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(__stringify(UBIFS_VERSION));
+MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
+MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
new file mode 100644
index 00000000000..e909f4a9644
--- /dev/null
+++ b/fs/ubifs/tnc.c
@@ -0,0 +1,2956 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements TNC (Tree Node Cache) which caches indexing nodes of
+ * the UBIFS B-tree.
+ *
+ * At the moment the locking rules of the TNC tree are quite simple and
+ * straightforward. We just have a mutex and lock it when we traverse the
+ * tree. If a znode is not in memory, we read it from flash while still having
+ * the mutex locked.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/*
+ * Return codes of the 'matches_name()' and 'fallible_matches_name()' functions.
+ * @NAME_LESS: name referred to by the zbranch is less than the name being
+ * matched
+ * @NAME_MATCHES: names match
+ * @NAME_GREATER: name referred to by the zbranch is greater than the name
+ * being matched
+ * @NOT_ON_MEDIA: node referred to by the zbranch does not exist on the media
+ * (see 'fallible_matches_name()')
+ *
+ * These constants were introduced to improve readability.
+ */
+enum {
+ NAME_LESS = 0,
+ NAME_MATCHES = 1,
+ NAME_GREATER = 2,
+ NOT_ON_MEDIA = 3,
+};
+
+/**
+ * insert_old_idx - record an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure (%-ENOMEM if the
+ * new RB-tree entry cannot be allocated).
+ *
+ * For recovery, there must always be a complete intact version of the index on
+ * flash at all times. That is called the "old index". It is the index as at the
+ * time of the last successful commit. Many of the index nodes in the old index
+ * may be dirty, but they must not be erased until the next successful commit
+ * (at which point that index becomes the old index).
+ *
+ * That means that the garbage collection and the in-the-gaps method of
+ * committing must be able to determine if an index node is in the old index.
+ * Most of the old index nodes can be found by looking up the TNC using the
+ * 'lookup_znode()' function. However, some of the old index nodes may have
+ * been deleted from the current index or may have been changed so much that
+ * they cannot be easily found. In those cases, an entry is added to an RB-tree.
+ * That is what this function does. The RB-tree is ordered by LEB number and
+ * offset because they uniquely identify the old index node.
+ *
+ * If an entry with the same @lnum:@offs is already present in the RB-tree, an
+ * error message is printed, the newly allocated entry is discarded and %0 is
+ * still returned.
+ */
+static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+ struct ubifs_old_idx *old_idx, *o;
+ struct rb_node **p, *parent = NULL;
+
+ old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+ if (unlikely(!old_idx))
+ return -ENOMEM;
+ old_idx->lnum = lnum;
+ old_idx->offs = offs;
+
+ p = &c->old_idx.rb_node;
+ while (*p) { /* descend to the link where the new entry belongs */
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_old_idx, rb);
+ if (lnum < o->lnum) /* primary sort key: LEB number */
+ p = &(*p)->rb_left;
+ else if (lnum > o->lnum)
+ p = &(*p)->rb_right;
+ else if (offs < o->offs) /* secondary sort key: offset within the LEB */
+ p = &(*p)->rb_left;
+ else if (offs > o->offs)
+ p = &(*p)->rb_right;
+ else {
+ ubifs_err("old idx added twice!");
+ kfree(old_idx);
+ return 0; /* duplicate is reported but not treated as a failure */
+ }
+ }
+ rb_link_node(&old_idx->rb, parent, p);
+ rb_insert_color(&old_idx->rb, &c->old_idx);
+ return 0;
+}
+
+/**
+ * insert_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * The flash position of @znode is taken from the branch which refers to it:
+ * the parent's zbranch at slot @znode->iip, or @c->zroot if @znode has no
+ * parent. Nothing is recorded if that branch has zero length.
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode)
+{
+	struct ubifs_zbranch *branch;
+
+	if (znode->parent)
+		branch = &znode->parent->zbranch[znode->iip];
+	else
+		branch = &c->zroot;
+
+	if (!branch->len)
+		return 0;
+	return insert_old_idx(c, branch->lnum, branch->offs);
+}
+
+/**
+ * ins_clr_old_idx_znode - record an obsoleted znode and clear its position.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * Like 'insert_old_idx_znode()', but after the flash position of @znode has
+ * been recorded, the branch which refers to @znode (the parent's zbranch, or
+ * @c->zroot if there is no parent) gets its LEB number, offset and length
+ * reset to zero. The branch is left untouched if recording fails or if its
+ * length is already zero.
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+static int ins_clr_old_idx_znode(struct ubifs_info *c,
+				 struct ubifs_znode *znode)
+{
+	struct ubifs_zbranch *branch;
+	int ret;
+
+	if (znode->parent)
+		branch = &znode->parent->zbranch[znode->iip];
+	else
+		branch = &c->zroot;
+
+	if (!branch->len)
+		return 0;
+
+	ret = insert_old_idx(c, branch->lnum, branch->offs);
+	if (ret)
+		return ret;
+
+	branch->lnum = 0;
+	branch->offs = 0;
+	branch->len = 0;
+	return 0;
+}
+
+/**
+ * destroy_old_idx - destroy the old_idx RB-tree.
+ * @c: UBIFS file-system description object
+ *
+ * During start commit, the old_idx RB-tree is used to avoid overwriting index
+ * nodes that were in the index last commit but have since been deleted. This
+ * is necessary for recovery i.e. the old index must be kept intact until the
+ * new index is successfully written. The old-idx RB-tree is used for the
+ * in-the-gaps method of writing index nodes and is destroyed every commit.
+ *
+ * The tree is freed iteratively, without recursion: the loop walks down to a
+ * node which has no children, steps back to its parent, clears the parent's
+ * link to that node and frees it. When no nodes are left, @c->old_idx is reset
+ * to an empty tree.
+ */
+void destroy_old_idx(struct ubifs_info *c)
+{
+ struct rb_node *this = c->old_idx.rb_node;
+ struct ubifs_old_idx *old_idx;
+
+ while (this) {
+ if (this->rb_left) { /* keep descending until a childless node is reached */
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ old_idx = rb_entry(this, struct ubifs_old_idx, rb);
+ this = rb_parent(this); /* continue from the parent after freeing */
+ if (this) {
+ if (this->rb_left == &old_idx->rb) /* unlink so the node is not visited again */
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(old_idx);
+ }
+ c->old_idx = RB_ROOT;
+}
+
+/**
+ * copy_znode - copy a dirty znode.
+ * @c: UBIFS file-system description object
+ * @znode: znode to copy
+ *
+ * A dirty znode being committed may not be changed, so it is copied.
+ *
+ * The copy is @c->max_znode_sz bytes large, is marked dirty, has the
+ * copy-on-write flag cleared and is not on the commit list (its 'cnext' is
+ * %NULL). Besides creating the copy, this function has side effects on the
+ * existing tree: the original @znode is flagged as obsolete, and if @znode is
+ * not a level 0 znode, every child which is in memory is re-parented to the
+ * copy. The dirty znode counter of @c is incremented as well. Note that the
+ * branch which refers to @znode is not updated here.
+ *
+ * Returns the copy on success and an error pointer (%-ENOMEM) on failure.
+ */
+static struct ubifs_znode *copy_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zn;
+
+ zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ if (unlikely(!zn))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(zn, znode, c->max_znode_sz);
+ zn->cnext = NULL;
+ __set_bit(DIRTY_ZNODE, &zn->flags);
+ __clear_bit(COW_ZNODE, &zn->flags);
+
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ __set_bit(OBSOLETE_ZNODE, &znode->flags);
+
+ if (znode->level != 0) {
+ int i;
+ const int n = zn->child_cnt;
+
+ /* The children now have new parent */
+ for (i = 0; i < n; i++) {
+ struct ubifs_zbranch *zbr = &zn->zbranch[i];
+
+ if (zbr->znode) /* children which are not loaded have no parent pointer to fix */
+ zbr->znode->parent = zn;
+ }
+ }
+
+ atomic_long_inc(&c->dirty_zn_cnt);
+ return zn;
+}
+
+/**
+ * add_idx_dirt - add dirt due to a dirty znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of index node
+ * @dirt: size of index node
+ *
+ * The calculated index size kept in @c is reduced by @dirt rounded up to a
+ * multiple of 8, and @dirt bytes of dirty space are accounted to LEB @lnum.
+ * Returns whatever 'ubifs_add_dirt()' returns.
+ */
+static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
+{
+	int aligned_sz = ALIGN(dirt, 8);
+
+	c->calc_idx_sz -= aligned_sz;
+	return ubifs_add_dirt(c, lnum, dirt);
+}
+
+/**
+ * dirty_cow_znode - ensure a znode is not being committed.
+ * @c: UBIFS file-system description object
+ * @zbr: branch of znode to check
+ *
+ * If the znode referred to by @zbr does not have the copy-on-write flag set,
+ * it is simply marked dirty (adjusting the dirty/clean znode counters and the
+ * index dirt accounting the first time it becomes dirty) and returned.
+ *
+ * Otherwise the znode is copied with 'copy_znode()', the flash position stored
+ * in @zbr (if any) is recorded in the old index RB-tree and accounted as dirt,
+ * and @zbr is switched over to the copy with its flash position zeroed.
+ *
+ * NOTE(review): if 'insert_old_idx()' fails in the copy path, this function
+ * returns without freeing the copy and without linking it into @zbr, although
+ * 'copy_znode()' has already flagged the old znode obsolete and re-parented
+ * its in-memory children to the copy. This looks like a memory leak plus an
+ * inconsistent tree on that error path; simply freeing the copy here would
+ * leave the children with a dangling parent pointer - confirm and fix together
+ * with 'copy_znode()'.
+ *
+ * Returns dirtied znode on success or negative error code on failure.
+ */
+static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr)
+{
+ struct ubifs_znode *znode = zbr->znode;
+ struct ubifs_znode *zn;
+ int err;
+
+ if (!test_bit(COW_ZNODE, &znode->flags)) {
+ /* znode is not being committed */
+ if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
+ atomic_long_inc(&c->dirty_zn_cnt);
+ atomic_long_dec(&c->clean_zn_cnt);
+ atomic_long_dec(&ubifs_clean_zn_cnt);
+ err = add_idx_dirt(c, zbr->lnum, zbr->len);
+ if (unlikely(err))
+ return ERR_PTR(err);
+ }
+ return znode;
+ }
+
+ zn = copy_znode(c, znode);
+ if (unlikely(IS_ERR(zn)))
+ return zn;
+
+ if (zbr->len) {
+ err = insert_old_idx(c, zbr->lnum, zbr->offs);
+ if (unlikely(err))
+ return ERR_PTR(err); /* NOTE(review): 'zn' is not freed or linked here, see above */
+ err = add_idx_dirt(c, zbr->lnum, zbr->len);
+ } else
+ err = 0;
+
+ zbr->znode = zn; /* the branch is switched to the copy even if add_idx_dirt() failed */
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+
+ if (unlikely(err))
+ return ERR_PTR(err);
+ return zn;
+}
+
+/**
+ * lnc_add - add a leaf node to the leaf node cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * Leaf nodes are non-index nodes, i.e. directory entry nodes or data nodes.
+ * The purpose of the leaf node cache is to save re-reading the same leaf node
+ * over and over again. Most things are cached by VFS, however the file system
+ * must cache directory entries for readdir and for resolving hash collisions.
+ * The present implementation of the leaf node cache is extremely simple, and
+ * allows for error returns that are not used but that may be needed if a more
+ * complex implementation is created.
+ *
+ * Note, this function does not add the @node object to LNC directly, but
+ * allocates a copy of the object and adds the copy to LNC. The reason for this
+ * is that @node has been allocated outside of the TNC subsystem and will be
+ * used with @c->tnc_mutex unlocked upon return from the TNC subsystem. But LNC
+ * may be changed at any time, e.g. freed by the shrinker.
+ *
+ * Returns %0 on success and the error code of 'ubifs_validate_entry()' if
+ * @node fails validation. Failure to allocate the copy is not an error: the
+ * node is just not cached and %0 is returned.
+ */
+static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ const void *node)
+{
+ int err;
+ void *lnc_node;
+ const struct ubifs_dent_node *dent = node;
+
+ ubifs_assert(!zbr->leaf);
+ ubifs_assert(zbr->len != 0);
+ ubifs_assert(is_hash_key(c, &zbr->key));
+
+ err = ubifs_validate_entry(c, dent);
+ if (err) {
+ dbg_dump_stack();
+ dbg_dump_node(c, dent);
+ return err;
+ }
+
+ lnc_node = kmalloc(zbr->len, GFP_NOFS);
+ if (!lnc_node)
+ /* We don't have to have the cache, so no error */
+ return 0;
+
+ memcpy(lnc_node, node, zbr->len);
+ zbr->leaf = lnc_node;
+ return 0;
+}
+
+/**
+ * lnc_add_directly - add a leaf node to the leaf-node-cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * This function is similar to 'lnc_add()', but it does not create a copy of
+ * @node but inserts @node to TNC directly.
+ *
+ * On success @zbr->leaf points to @node, so the memory must not be freed by
+ * the caller any more (it is released by 'lnc_free()'). Returns %0 on success
+ * and the error code of 'ubifs_validate_entry()' if @node fails validation, in
+ * which case @zbr is left unchanged and @node still belongs to the caller.
+ */
+static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
+{
+ int err;
+
+ ubifs_assert(!zbr->leaf);
+ ubifs_assert(zbr->len != 0);
+
+ err = ubifs_validate_entry(c, node);
+ if (err) {
+ dbg_dump_stack();
+ dbg_dump_node(c, node);
+ return err;
+ }
+
+ zbr->leaf = node;
+ return 0;
+}
+
+/**
+ * lnc_free - remove a leaf node from the leaf node cache.
+ * @zbr: zbranch of leaf node
+ *
+ * Frees the cached leaf node attached to @zbr, if there is one, and resets
+ * @zbr->leaf so that the branch does not refer to the freed memory.
+ */
+static void lnc_free(struct ubifs_zbranch *zbr)
+{
+	if (zbr->leaf) {
+		kfree(zbr->leaf);
+		zbr->leaf = NULL;
+	}
+}
+
+/**
+ * tnc_read_node_nm - read a "hashed" leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a "hashed" node defined by @zbr from the leaf node cache
+ * (if it is there) or from the flash media, in which case the node is also
+ * added to LNC. Returns zero in case of success or a negative error code in
+ * case of failure.
+ */
+static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			    void *node)
+{
+	int ret;
+
+	ubifs_assert(is_hash_key(c, &zbr->key));
+
+	if (!zbr->leaf) {
+		/* Not cached: read the node, then add it to the cache */
+		ret = ubifs_tnc_read_node(c, zbr, node);
+		if (ret)
+			return ret;
+		return lnc_add(c, zbr, node);
+	}
+
+	/* Cached: hand out a copy of the node kept in the cache */
+	ubifs_assert(zbr->len != 0);
+	memcpy(node, zbr->leaf, zbr->len);
+	return 0;
+}
+
+/**
+ * try_read_node - read a node if it is a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: LEB number of node to read
+ * @offs: offset of node to read
+ *
+ * This function reads @len bytes from @lnum:@offs into @buf and checks whether
+ * they form a node of type @type and length @len: the magic number, the node
+ * type, the length stored in the common header and the CRC must all match.
+ * It returns %1 if such a node is present and %0 if it is not. If the read
+ * itself fails, the error code of 'ubi_read()' is returned. Unlike
+ * 'ubifs_read_node()', the absence of a node is not treated as an error.
+ */
+static int try_read_node(const struct ubifs_info *c, void *buf, int type,
+			 int len, int lnum, int offs)
+{
+	const struct ubifs_ch *hdr = buf;
+	int ret, hdr_len;
+
+	dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+
+	ret = ubi_read(c->ubi, lnum, buf, offs, len);
+	if (ret) {
+		ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
+			  type, lnum, offs, ret);
+		return ret;
+	}
+
+	/* Anything which does not look like the expected node is "no node" */
+	if (le32_to_cpu(hdr->magic) != UBIFS_NODE_MAGIC ||
+	    hdr->node_type != type)
+		return 0;
+
+	hdr_len = le32_to_cpu(hdr->len);
+	if (hdr_len != len)
+		return 0;
+
+	/* The CRC is calculated over the node without its first 8 bytes */
+	if (crc32(UBIFS_CRC32_INIT, buf + 8, hdr_len - 8) !=
+	    le32_to_cpu(hdr->crc))
+		return 0;
+
+	return 1;
+}
+
+/**
+ * fallible_read_node - try to read a leaf node.
+ * @c: UBIFS file-system description object
+ * @key: key of node to read
+ * @zbr: position of node
+ * @node: node returned
+ *
+ * This function tries to read the node @zbr points to and checks that the key
+ * stored in the node equals @key. It returns %1 if the node is read, %0 if
+ * there is no such node at that position (a "dangling branch"), and a negative
+ * error code in the case of error.
+ */
+static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
+			      struct ubifs_zbranch *zbr, void *node)
+{
+	int status;
+
+	dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+
+	status = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
+			       zbr->offs);
+	if (status != 0 && status != 1)
+		return status;
+
+	if (status == 1) {
+		union ubifs_key found_key;
+		struct ubifs_dent_node *dent = node;
+
+		/* All nodes have key in the same place */
+		key_read(c, &dent->key, &found_key);
+		if (keys_cmp(c, key, &found_key) == 0)
+			return 1;
+	}
+
+	dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
+		zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+	return 0;
+}
+
+/**
+ * matches_name - determine if a direntry or xattr entry matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This function compares the name of the xentry/direntry referred to by
+ * zbranch @zbr with name @nm. If the entry is not in the leaf node cache yet,
+ * it is read from the media and inserted into the cache. Returns
+ * %NAME_MATCHES if the names are equal, %NAME_LESS if the name referred to by
+ * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
+ * of failure, a negative error code is returned.
+ */
+static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			const struct qstr *nm)
+{
+	struct ubifs_dent_node *dent;
+	int nlen, err;
+
+	if (zbr->leaf) {
+		/* If possible, match against the dent in the leaf node cache */
+		dent = zbr->leaf;
+	} else {
+		dent = kmalloc(zbr->len, GFP_NOFS);
+		if (!dent)
+			return -ENOMEM;
+
+		err = ubifs_tnc_read_node(c, zbr, dent);
+		if (!err)
+			/* Add the node to the leaf node cache */
+			err = lnc_add_directly(c, zbr, dent);
+		if (err) {
+			kfree(dent);
+			return err;
+		}
+	}
+
+	nlen = le16_to_cpu(dent->nlen);
+	err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+	if (err < 0)
+		return NAME_LESS;
+	if (err > 0)
+		return NAME_GREATER;
+
+	/* The common prefix is equal, so the shorter name is the lesser one */
+	if (nlen == nm->len)
+		return NAME_MATCHES;
+	if (nlen < nm->len)
+		return NAME_LESS;
+	return NAME_GREATER;
+}
+
+/**
+ * get_znode - get a TNC znode that may not be loaded yet.
+ * @c: UBIFS file-system description object
+ * @znode: parent znode
+ * @n: znode branch slot number
+ *
+ * If the child at slot @n of @znode is already in memory it is returned
+ * straight away, otherwise it is loaded with 'ubifs_load_znode()'. This
+ * function returns the znode or a negative error code.
+ */
+static struct ubifs_znode *get_znode(struct ubifs_info *c,
+				     struct ubifs_znode *znode, int n)
+{
+	struct ubifs_zbranch *branch = &znode->zbranch[n];
+
+	if (!branch->znode)
+		return ubifs_load_znode(c, branch, znode, n);
+	return branch->znode;
+}
+
+/**
+ * tnc_next - find next TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is passed and returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * If slot @n is not the last one in @zn, the next entry is simply the next
+ * slot of the same znode. Otherwise the function climbs towards the root until
+ * it finds an ancestor which has a branch to the right of the one it came
+ * from, and then descends along the leftmost branches down to level 0, loading
+ * znodes which are not in memory on the way.
+ *
+ * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
+ * no next entry, or a negative error code otherwise. @zn and @n are modified
+ * only if %0 is returned.
+ */
+static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+ struct ubifs_znode *znode = *zn;
+ int nn = *n;
+
+ nn += 1;
+ if (nn < znode->child_cnt) {
+ *n = nn;
+ return 0;
+ }
+ while (1) {
+ struct ubifs_znode *zp;
+
+ zp = znode->parent;
+ if (!zp)
+ return -ENOENT; /* reached the root without finding a branch to the right */
+ nn = znode->iip + 1;
+ znode = zp;
+ if (nn < znode->child_cnt) {
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ while (znode->level != 0) { /* go down the leftmost branches */
+ znode = get_znode(c, znode, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+ nn = 0;
+ break;
+ }
+ }
+ *zn = znode;
+ *n = nn;
+ return 0;
+}
+
+/**
+ * tnc_prev - find previous TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is passed and returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * If slot @n is not the first one in @zn, the previous entry is simply the
+ * previous slot of the same znode. Otherwise the function climbs towards the
+ * root until it finds an ancestor which has a branch to the left of the one it
+ * came from, and then descends along the rightmost branches down to level 0,
+ * loading znodes which are not in memory on the way.
+ *
+ * This function returns %0 if the previous TNC entry is found, %-ENOENT if
+ * there is no previous entry, or a negative error code otherwise. @zn and @n
+ * are modified only if %0 is returned.
+ */
+static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+ struct ubifs_znode *znode = *zn;
+ int nn = *n;
+
+ if (nn > 0) {
+ *n = nn - 1;
+ return 0;
+ }
+ while (1) {
+ struct ubifs_znode *zp;
+
+ zp = znode->parent;
+ if (!zp)
+ return -ENOENT; /* reached the root without finding a branch to the left */
+ nn = znode->iip - 1;
+ znode = zp;
+ if (nn >= 0) {
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ while (znode->level != 0) { /* go down the rightmost branches */
+ nn = znode->child_cnt - 1;
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+ nn = znode->child_cnt - 1;
+ break;
+ }
+ }
+ *zn = znode;
+ *n = nn;
+ return 0;
+}
+
+/**
+ * resolve_collision - resolve a collision.
+ * @c: UBIFS file-system description object
+ * @key: key of a directory or extended attribute entry
+ * @zn: znode is passed and returned here
+ * @n: zbranch number is passed and returned here
+ * @nm: name of the entry
+ *
+ * This function is called for "hashed" keys to make sure that the found key
+ * really corresponds to the looked up node (directory or extended attribute
+ * entry). It returns %1 and sets @zn and @n if the collision is resolved.
+ * %0 is returned if @nm is not found and @zn and @n are set to the previous
+ * entry, i.e. to the entry after which @nm could follow if it were in TNC.
+ * This means that @n may be set to %-1 if the leftmost key in @zn is the
+ * previous one. A negative error code is returned on failures.
+ *
+ * The entry at @zn/@n is compared with @nm first. If its name is greater than
+ * @nm, the entries to the left which have the same key are examined, otherwise
+ * the entries to the right are examined.
+ */
+static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ const struct qstr *nm)
+{
+ int err;
+
+ err = matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (unlikely(err < 0))
+ return err;
+ if (err == NAME_MATCHES)
+ return 1;
+
+ if (err == NAME_GREATER) {
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, zn, n);
+ if (err == -ENOENT) {
+ ubifs_assert(*n == 0);
+ *n = -1;
+ return 0;
+ }
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+ /*
+ * We have found the branch after which we would
+ * like to insert, but inserting in this znode
+ * may still be wrong. Consider the following 3
+ * znodes, in the case where we are resolving a
+ * collision with Key2.
+ *
+ * znode zp
+ * ----------------------
+ * level 1 | Key0 | Key1 |
+ * -----------------------
+ * | |
+ * znode za | | znode zb
+ * ------------ ------------
+ * level 0 | Key0 | | Key2 |
+ * ------------ ------------
+ *
+ * The lookup finds Key2 in znode zb. Let's say
+ * there is no match and the name is greater so
+ * we look left. When we find Key0, we end up
+ * here. If we return now, we will insert into
+ * znode za at slot n = 1. But that is invalid
+ * according to the parent's keys. Key2 must
+ * be inserted into znode zb.
+ *
+ * Note, this problem is not relevant for the
+ * case when we go right, because
+ * 'tnc_insert()' would correct the parent key.
+ */
+ if (*n == (*zn)->child_cnt - 1) {
+ err = tnc_next(c, zn, n);
+ if (err) {
+ /* Should be impossible */
+ ubifs_assert(0);
+ if (err == -ENOENT)
+ err = -EINVAL;
+ return err;
+ }
+ ubifs_assert(*n == 0);
+ *n = -1;
+ }
+ return 0;
+ }
+ err = matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_LESS)
+ return 0;
+ if (err == NAME_MATCHES)
+ return 1;
+ ubifs_assert(err == NAME_GREATER);
+ }
+ } else {
+ int nn = *n;
+ struct ubifs_znode *znode = *zn;
+
+ /* Look right */
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ return 0;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ return 0;
+ err = matches_name(c, &znode->zbranch[nn], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_GREATER)
+ return 0;
+ *zn = znode; /* this entry is not greater than @nm, so it becomes the current position */
+ *n = nn;
+ if (err == NAME_MATCHES)
+ return 1;
+ ubifs_assert(err == NAME_LESS);
+ }
+ }
+}
+
+/**
+ * fallible_matches_name - determine if a dent matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This is a "fallible" version of 'matches_name()' function which does not
+ * panic if the direntry/xentry referred by @zbr does not exist on the media.
+ *
+ * This function checks if xentry/direntry referred by zbranch @zbr matches name
+ * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
+ * @zbr is less than @nm, %NAME_GREATER if it is greater than @nm, and
+ * %NOT_ON_MEDIA if xentry/direntry referred by @zbr does not exist on the
+ * media. A negative error code is returned in case of failure.
+ *
+ * If the entry is not in the leaf node cache and is successfully read from the
+ * media, it is inserted into the leaf node cache.
+ */
+static int fallible_matches_name(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr,
+ const struct qstr *nm)
+{
+ struct ubifs_dent_node *dent;
+ int nlen, err;
+
+ /* If possible, match against the dent in the leaf node cache */
+ if (!zbr->leaf) {
+ dent = kmalloc(zbr->len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ err = fallible_read_node(c, &zbr->key, zbr, dent);
+ if (err < 0)
+ goto out_free;
+ if (err == 0) {
+ /* The node was not present */
+ err = NOT_ON_MEDIA;
+ goto out_free;
+ }
+ ubifs_assert(err == 1);
+
+ err = lnc_add_directly(c, zbr, dent); /* on success the cache owns 'dent' */
+ if (err)
+ goto out_free;
+ } else
+ dent = zbr->leaf;
+
+ nlen = le16_to_cpu(dent->nlen);
+ err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ if (err == 0) {
+ if (nlen == nm->len) /* equal common prefix: the shorter name is the lesser one */
+ return NAME_MATCHES;
+ else if (nlen < nm->len)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+ } else if (err < 0)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+
+out_free:
+ kfree(dent);
+ return err;
+}
+
+/**
+ * fallible_resolve_collision - resolve a collision even if nodes are missing.
+ * @c: UBIFS file-system description object
+ * @key: key
+ * @zn: znode is passed and returned here
+ * @n: branch number is passed and returned here
+ * @nm: name of directory entry
+ * @adding: indicates caller is adding a key to the TNC
+ *
+ * This is a "fallible" version of the 'resolve_collision()' function which
+ * does not panic if one of the nodes referred to by TNC does not exist on the
+ * media. This may happen when replaying the journal if a deleted node was
+ * Garbage-collected and the commit was not done. A branch that refers to a node
+ * that is not present is called a dangling branch. The following are the return
+ * codes for this function:
+ * o if @nm was found, %1 is returned and @zn and @n are set to the found
+ * branch;
+ * o if we are @adding and @nm was not found, %0 is returned;
+ * o if we are not @adding and @nm was not found, but a dangling branch was
+ * found, then %1 is returned and @zn and @n are set to the dangling branch;
+ * o if we are not @adding and neither @nm nor a dangling branch was found,
+ * %0 is returned;
+ * o a negative error code is returned in case of failure.
+ */
+static int fallible_resolve_collision(struct ubifs_info *c,
+ const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ const struct qstr *nm, int adding)
+{
+ struct ubifs_znode *o_znode = NULL, *znode = *zn; /* 'o_znode'/'o_n' remember the last dangling branch seen */
+ int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+
+ cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
+ if (unlikely(cmp < 0))
+ return cmp;
+ if (cmp == NAME_MATCHES)
+ return 1;
+ if (cmp == NOT_ON_MEDIA) {
+ o_znode = znode;
+ o_n = nn;
+ /*
+ * We are unlucky and hit a dangling branch straight away.
+ * Now we do not really know where to go to find the needed
+ * branch - to the left or to the right. Well, let's try left.
+ */
+ unsure = 1;
+ } else if (!adding)
+ unsure = 1; /* Remove a dangling branch wherever it is */
+
+ if (cmp == NAME_GREATER || unsure) {
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, zn, n);
+ if (err == -ENOENT) {
+ ubifs_assert(*n == 0);
+ *n = -1;
+ break;
+ }
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+ /* See comments in 'resolve_collision()' */
+ if (*n == (*zn)->child_cnt - 1) {
+ err = tnc_next(c, zn, n);
+ if (err) {
+ /* Should be impossible */
+ ubifs_assert(0);
+ if (err == -ENOENT)
+ err = -EINVAL;
+ return err;
+ }
+ ubifs_assert(*n == 0);
+ *n = -1;
+ }
+ break;
+ }
+ err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_MATCHES)
+ return 1;
+ if (err == NOT_ON_MEDIA) {
+ o_znode = *zn;
+ o_n = *n;
+ continue;
+ }
+ if (!adding)
+ continue; /* when not adding, every colliding entry on the left is examined */
+ if (err == NAME_LESS)
+ break;
+ else
+ unsure = 0;
+ }
+ }
+
+ if (cmp == NAME_LESS || unsure) {
+ /* Look right */
+ *zn = znode; /* restart from the entry the caller passed in */
+ *n = nn;
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ break;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ break;
+ err = fallible_matches_name(c, &znode->zbranch[nn], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_GREATER)
+ break;
+ *zn = znode;
+ *n = nn;
+ if (err == NAME_MATCHES)
+ return 1;
+ if (err == NOT_ON_MEDIA) {
+ o_znode = znode;
+ o_n = nn;
+ }
+ }
+ }
+
+ /* Never match a dangling branch when adding */
+ if (adding || !o_znode)
+ return 0;
+
+ dbg_mnt("dangling match LEB %d:%d len %d %s",
+ o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
+ o_znode->zbranch[o_n].len, DBGKEY(key));
+ *zn = o_znode;
+ *n = o_n;
+ return 1;
+}
+
+/**
+ * matches_position - determine if a zbranch matches a given position.
+ * @zbr: zbranch of dent
+ * @lnum: LEB number of dent to match
+ * @offs: offset of dent to match
+ *
+ * This function returns %1 if the node @zbr refers to is located at
+ * @lnum:@offs, and %0 otherwise.
+ */
+static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs)
+{
+	return zbr->lnum == lnum && zbr->offs == offs;
+}
+
+/**
+ * resolve_collision_directly - resolve a collision directly.
+ * @c: UBIFS file-system description object
+ * @key: key of directory entry
+ * @zn: znode is passed and returned here
+ * @n: zbranch number is passed and returned here
+ * @lnum: LEB number of dent node to match
+ * @offs: offset of dent node to match
+ *
+ * This function is used for "hashed" keys to make sure the found directory or
+ * extended attribute entry node is what was looked for. It is used when the
+ * flash address of the right node is known (@lnum:@offs) which makes it much
+ * easier to resolve collisions (no need to read entries and match full
+ * names). This function returns %1 and sets @zn and @n if the collision is
+ * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the
+ * previous directory entry. Otherwise a negative error code is returned.
+ *
+ * The entries to the left of @zn/@n which have the same key are searched using
+ * local copies of the position, so @zn and @n change on that side only if a
+ * match is found. The search to the right then restarts from the original
+ * position and advances @zn and @n over every entry with the same key.
+ */
+static int resolve_collision_directly(struct ubifs_info *c,
+ const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode;
+ int nn, err;
+
+ znode = *zn;
+ nn = *n;
+ if (matches_position(&znode->zbranch[nn], lnum, offs))
+ return 1;
+
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, &znode, &nn);
+ if (err == -ENOENT)
+ break;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ break; /* left the run of entries which have the same key */
+ if (matches_position(&znode->zbranch[nn], lnum, offs)) {
+ *zn = znode;
+ *n = nn;
+ return 1;
+ }
+ }
+
+ /* Look right */
+ znode = *zn;
+ nn = *n;
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ return 0;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ return 0;
+ *zn = znode;
+ *n = nn;
+ if (matches_position(&znode->zbranch[nn], lnum, offs))
+ return 1;
+ }
+}
+
+/**
+ * dirty_cow_bottom_up - dirty a znode and its ancestors.
+ * @c: UBIFS file-system description object
+ * @znode: znode to dirty
+ *
+ * If we do not have a unique key that resides in a znode, then we cannot
+ * dirty that znode from the top down (i.e. by using lookup_level0_dirty)
+ * This function records the path back to the last dirty ancestor, and then
+ * dirties the znodes on that path.
+ *
+ * The path is a list of branch slot numbers stored in @c->bottom_up_buf. If
+ * the level of the root znode is greater than %BOTTOM_UP_HEIGHT, the buffer is
+ * freed and re-allocated with one element per level before it is used.
+ *
+ * Returns the znode 'dirty_cow_znode()' returned for the last znode on the
+ * path, or an error pointer in case of failure (%-ENOMEM if the path buffer
+ * cannot be allocated, or the error 'dirty_cow_znode()' returned).
+ */
+static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zp;
+ int *path = c->bottom_up_buf, p = 0; /* 'p' is the number of slots recorded in 'path' */
+
+ ubifs_assert(c->zroot.znode);
+ ubifs_assert(znode);
+ if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) {
+ kfree(c->bottom_up_buf);
+ c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int),
+ GFP_NOFS);
+ if (!c->bottom_up_buf)
+ return ERR_PTR(-ENOMEM);
+ path = c->bottom_up_buf;
+ }
+ if (c->zroot.znode->level) {
+ /* Go up until parent is dirty */
+ while (1) {
+ int n;
+
+ zp = znode->parent;
+ if (!zp)
+ break;
+ n = znode->iip;
+ ubifs_assert(p < c->zroot.znode->level);
+ path[p++] = n; /* remember which slot of the parent leads back down */
+ if (!zp->cnext && ubifs_zn_dirty(znode))
+ break;
+ znode = zp;
+ }
+ }
+
+ /* Come back down, dirtying as we go */
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ zp = znode->parent;
+ if (zp) {
+ ubifs_assert(path[p - 1] >= 0);
+ ubifs_assert(path[p - 1] < zp->child_cnt);
+ zbr = &zp->zbranch[path[--p]];
+ znode = dirty_cow_znode(c, zbr);
+ } else {
+ ubifs_assert(znode == c->zroot.znode);
+ znode = dirty_cow_znode(c, &c->zroot); /* the root is referred to by @c->zroot, not by a parent branch */
+ }
+ if (unlikely(IS_ERR(znode)) || !p)
+ break;
+ ubifs_assert(path[p - 1] >= 0);
+ ubifs_assert(path[p - 1] < znode->child_cnt);
+ znode = znode->zbranch[path[p - 1]].znode; /* step down; the slot is consumed on the next iteration */
+ }
+
+ return znode;
+}
+
+/**
+ * ubifs_lookup_level0 - search for zero-level znode.
+ * @c: UBIFS file-system description object
+ * @key: key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and search for zero-level znode which
+ * refers key @key. The found zero-level znode is returned in @zn. There are 3
+ * cases:
+ * o exact match, i.e. the found zero-level znode contains key @key, then %1
+ * is returned and slot number of the matched branch is stored in @n;
+ * o not exact match, which means that zero-level znode does not contain
+ * @key, then %0 is returned and slot number of the closest branch is stored
+ * in @n;
+ * o @key is so small that it is even less than the lowest key of the
+ * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n)
+{
+ int err, exact;
+ struct ubifs_znode *znode;
+ unsigned long time = get_seconds();
+
+ dbg_tnc("search key %s", DBGKEY(key));
+
+ znode = c->zroot.znode;
+ if (unlikely(!znode)) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ znode->time = time;
+
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ exact = ubifs_search_zbranch(c, znode, key, n);
+
+ if (znode->level == 0)
+ break;
+
+ if (*n < 0)
+ *n = 0;
+ zbr = &znode->zbranch[*n];
+
+ if (zbr->znode) {
+ znode->time = time;
+ znode = zbr->znode;
+ continue;
+ }
+
+ /* znode is not in TNC cache, load it from the media */
+ znode = ubifs_load_znode(c, zbr, znode, *n);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ *zn = znode;
+ if (exact || !is_hash_key(c, key) || *n != -1) {
+ dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+ return exact;
+ }
+
+ /*
+ * Here is a tricky place. We have not found the key and this is a
+ * "hashed" key, which may collide. The rest of the code deals with
+ * situations like this:
+ *
+ * | 3 | 5 |
+ * / \
+ * | 3 | 5 | | 6 | 7 | (x)
+ *
+ * Or more a complex example:
+ *
+ * | 1 | 5 |
+ * / \
+ * | 1 | 3 | | 5 | 8 |
+ * \ /
+ * | 5 | 5 | | 6 | 7 | (x)
+ *
+ * In the examples, if we are looking for key "5", we may reach nodes
+ * marked with "(x)". In this case what we have do is to look at the
+ * left and see if there is "5" key there. If there is, we have to
+ * return it.
+ *
+ * Note, this whole situation is possible because we allow to have
+ * elements which are equivalent to the next key in the parent in the
+ * children of current znode. For example, this happens if we split a
+ * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something
+ * like this:
+ * | 3 | 5 |
+ * / \
+ * | 3 | 5 | | 5 | 6 | 7 |
+ * ^
+ * And this becomes what is at the first "picture" after key "5" marked
+ * with "^" is removed. What could be done is we could prohibit
+ * splitting in the middle of the colliding sequence. Also, when
+ * removing the leftmost key, we would have to correct the key of the
+ * parent node, which would introduce additional complications. Namely,
+ * if we changed the the leftmost key of the parent znode, the garbage
+ * collector would be unable to find it (GC is doing this when GC'ing
+ * indexing LEBs). Although we already have an additional RB-tree where
+ * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
+ * after the commit. But anyway, this does not look easy to implement
+ * so we did not try this.
+ */
+ err = tnc_prev(c, &znode, n);
+ if (err == -ENOENT) {
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ *n = -1;
+ return 0;
+ }
+ if (unlikely(err < 0))
+ return err;
+ if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ *n = -1;
+ return 0;
+ }
+
+ dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+ *zn = znode;
+ return 1;
+}
+
+/**
+ * lookup_level0_dirty - search for zero-level znode dirtying.
+ * @c: UBIFS file-system description object
+ * @key: key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and searches for the zero-level znode
+ * which refers key @key. The found zero-level znode is returned in @zn. There
+ * are 3 cases:
+ * o exact match, i.e. the found zero-level znode contains key @key, then %1
+ * is returned and slot number of the matched branch is stored in @n;
+ * o not exact match, which means that zero-level znode does not contain @key
+ * then %0 is returned and slot number of the closest branch is stored in
+ * @n;
+ * o @key is so small that it is even less than the lowest key of the
+ * leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
+ *
+ * Additionally all znodes in the path from the root to the located zero-level
+ * znode are marked as dirty.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n)
+{
+ int err, exact;
+ struct ubifs_znode *znode;
+ unsigned long time = get_seconds();
+
+ dbg_tnc("search and dirty key %s", DBGKEY(key));
+
+ /* Make sure the root znode is cached before dirtying it */
+ znode = c->zroot.znode;
+ if (unlikely(!znode)) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ /* Continue with whatever znode 'dirty_cow_znode()' hands back */
+ znode = dirty_cow_znode(c, &c->zroot);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+
+ znode->time = time;
+
+ /* Walk down to level 0, dirtying every znode on the way */
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ exact = ubifs_search_zbranch(c, znode, key, n);
+
+ if (znode->level == 0)
+ break;
+
+ /* A negative slot cannot be followed, use the leftmost child */
+ if (*n < 0)
+ *n = 0;
+ zbr = &znode->zbranch[*n];
+
+ if (zbr->znode) {
+ znode->time = time;
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ continue;
+ }
+
+ /* znode is not in TNC cache, load it from the media */
+ znode = ubifs_load_znode(c, zbr, znode, *n);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ *zn = znode;
+ if (exact || !is_hash_key(c, key) || *n != -1) {
+ dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+ return exact;
+ }
+
+ /*
+ * See the huge comment in 'ubifs_lookup_level0()' for what the rest of
+ * the code is about.
+ */
+ err = tnc_prev(c, &znode, n);
+ if (err == -ENOENT) {
+ *n = -1;
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ return 0;
+ }
+ if (unlikely(err < 0))
+ return err;
+ if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+ *n = -1;
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ return 0;
+ }
+
+ /*
+ * 'tnc_prev()' may have moved to a znode which was not on the path
+ * dirtied above, so dirty it (and its parents) if needed.
+ */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+ *zn = znode;
+ return 1;
+}
+
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function looks up the index entry for @key and reads the node it
+ * refers to into @node. The caller has to make sure the @node buffer is large
+ * enough to fit the node. Returns zero in case of success, %-ENOENT if the
+ * node was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
+		     void *node)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+	struct ubifs_zbranch snapshot;
+
+	mutex_lock(&c->tnc_mutex);
+
+	ret = ubifs_lookup_level0(c, key, &leaf, &slot);
+	if (ret <= 0) {
+		/* Zero means "no such key", negative is an error code */
+		if (ret == 0)
+			ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (is_hash_key(c, key)) {
+		/*
+		 * The leaf node cache gets used for hashed keys, so the read
+		 * works on the zbranch itself and the mutex stays locked.
+		 */
+		ret = tnc_read_node_nm(c, &leaf->zbranch[slot], node);
+		goto out_unlock;
+	}
+
+	/* Read from a private copy of the zbranch, without the mutex held */
+	snapshot = leaf->zbranch[slot];
+	mutex_unlock(&c->tnc_mutex);
+
+	return ubifs_tnc_read_node(c, &snapshot, node);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * ubifs_tnc_locate - look up a file-system node and return it and its location.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @lnum: LEB number is returned here
+ * @offs: offset is returned here
+ *
+ * This function does the same as 'ubifs_tnc_lookup()' and additionally
+ * reports where the node lives (@lnum and @offs). The location is stored
+ * before the node is read. Returns zero in case of success, %-ENOENT if the
+ * node was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+		     void *node, int *lnum, int *offs)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+	struct ubifs_zbranch snapshot, *br;
+
+	mutex_lock(&c->tnc_mutex);
+
+	ret = ubifs_lookup_level0(c, key, &leaf, &slot);
+	if (ret <= 0) {
+		/* Zero means "no such key", negative is an error code */
+		if (ret == 0)
+			ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	br = &leaf->zbranch[slot];
+	if (is_hash_key(c, key)) {
+		/*
+		 * The leaf node cache gets used for hashed keys, so the read
+		 * works on the zbranch itself and the mutex stays locked.
+		 */
+		*lnum = br->lnum;
+		*offs = br->offs;
+		ret = tnc_read_node_nm(c, br, node);
+		goto out_unlock;
+	}
+
+	/* Read from a private copy of the zbranch, without the mutex held */
+	snapshot = *br;
+	mutex_unlock(&c->tnc_mutex);
+
+	*lnum = snapshot.lnum;
+	*offs = snapshot.offs;
+
+	return ubifs_tnc_read_node(c, &snapshot, node);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * do_lookup_nm - look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function looks up and reads a node whose key contains a name hash.
+ * Because hashes may collide, several nodes may share the same key, so the
+ * colliding entries are resolved by name with 'resolve_collision()'. Returns
+ * zero in case of success, %-ENOENT if the node was not found, and a negative
+ * error code in case of failure.
+ */
+static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, const struct qstr *nm)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+	struct ubifs_zbranch snapshot;
+
+	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+	mutex_lock(&c->tnc_mutex);
+
+	ret = ubifs_lookup_level0(c, key, &leaf, &slot);
+	if (ret <= 0) {
+		/* Zero means "no such key", negative is an error code */
+		if (ret == 0)
+			ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	ubifs_assert(slot >= 0);
+
+	/* Pick the entry with the right name among the colliding ones */
+	ret = resolve_collision(c, key, &leaf, &slot, nm);
+	dbg_tnc("rc returned %d, znode %p, n %d", ret, leaf, slot);
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	if (ret == 0) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	/* Read from a private copy of the zbranch, without the mutex held */
+	snapshot = leaf->zbranch[slot];
+	mutex_unlock(&c->tnc_mutex);
+
+	return tnc_read_node_nm(c, &snapshot, node);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * ubifs_tnc_lookup_nm - look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function looks up and reads a node whose key contains a name hash.
+ * It first tries a plain 'ubifs_tnc_lookup()' and compares the name of the
+ * entry it got with @nm; only if the names differ does it fall back to
+ * 'do_lookup_nm()', which walks the colliding entries. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code
+ * in case of failure.
+ */
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, const struct qstr *nm)
+{
+	const struct ubifs_dent_node *candidate = node;
+	int ret, nlen;
+
+	/*
+	 * Optimistic path: most of the time there is no name collision and
+	 * the plain lookup already returns the right direntry.
+	 */
+	ret = ubifs_tnc_lookup(c, key, node);
+	if (ret)
+		return ret;
+
+	nlen = le16_to_cpu(candidate->nlen);
+	if (nm->len != nlen || memcmp(candidate->name, nm->name, nlen))
+		/*
+		 * The hash collided with another name, so look at every
+		 * direntry with this key until the right one is found.
+		 */
+		return do_lookup_nm(c, key, node, nm);
+
+	return 0;
+}
+
+/**
+ * correct_parent_keys - correct parent znodes' keys.
+ * @c: UBIFS file-system description object
+ * @znode: znode to correct parent znodes for
+ *
+ * This is a helper function for 'tnc_insert()'. After the key of the leftmost
+ * zbranch of @znode has changed, the first key of each ancestor may have to
+ * be lowered as well. The function climbs towards the root for as long as the
+ * ancestor's first key is greater than the new key, copying the new key into
+ * it and marking the ancestor with the 'alt' flag.
+ */
+static void correct_parent_keys(const struct ubifs_info *c,
+				struct ubifs_znode *znode)
+{
+	union ubifs_key *lowest, *pkey;
+
+	ubifs_assert(znode->parent);
+	ubifs_assert(znode->iip == 0);
+
+	lowest = &znode->zbranch[0].key;
+
+	for (pkey = &znode->parent->zbranch[0].key;
+	     keys_cmp(c, lowest, pkey) < 0;
+	     pkey = &znode->parent->zbranch[0].key) {
+		key_copy(c, lowest, pkey);
+		znode = znode->parent;
+		znode->alt = 1;
+		/* Stop at the root or once we are not the leftmost child */
+		if (!znode->parent || znode->iip)
+			break;
+	}
+}
+
+/**
+ * insert_zbranch - insert a zbranch into a znode.
+ * @znode: znode into which to insert
+ * @zbr: zbranch to insert
+ * @n: slot number to insert to
+ *
+ * This is a helper function for 'tnc_insert()'. UBIFS keeps the zbranch array
+ * of a znode free of "gaps", so to put a new zbranch into slot @n every
+ * zbranch from @n onwards is first moved one slot to the right. For
+ * non-leaf znodes the 'iip' of each moved child, and of the inserted child,
+ * is updated to its new slot.
+ */
+static void insert_zbranch(struct ubifs_znode *znode,
+			   const struct ubifs_zbranch *zbr, int n)
+{
+	int slot, has_children;
+
+	ubifs_assert(ubifs_zn_dirty(znode));
+
+	/* Only non-leaf znodes have child znodes whose 'iip' must follow */
+	has_children = znode->level != 0;
+
+	for (slot = znode->child_cnt; slot > n; slot--) {
+		znode->zbranch[slot] = znode->zbranch[slot - 1];
+		if (has_children && znode->zbranch[slot].znode)
+			znode->zbranch[slot].znode->iip = slot;
+	}
+	if (has_children && zbr->znode)
+		zbr->znode->iip = n;
+
+	znode->zbranch[n] = *zbr;
+	znode->child_cnt += 1;
+
+	/*
+	 * An insertion at slot zero may change the lower bound of this
+	 * znode's key range. If the znode is split later on, the upper bound
+	 * may change too, possibly to below the original lower bound. Once
+	 * that happens the znode can no longer be found in the TNC by the key
+	 * stored in the on-flash index node. That is bad: a znode which is
+	 * not found is assumed to be obsolete and may be overwritten, and
+	 * after an unclean unmount the old, now broken, index would be used.
+	 *
+	 * So znodes with an insertion at slot zero are marked here, and if
+	 * they are split their lnum/offs are added to the old_idx tree.
+	 */
+	if (n == 0)
+		znode->alt = 1;
+}
+
+/**
+ * tnc_insert - insert a node into TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to insert into
+ * @zbr: branch to insert
+ * @n: slot number to insert new zbranch to
+ *
+ * This function inserts a new node described by @zbr into znode @znode. If
+ * znode does not have a free slot for new zbranch, it is split. Parent znodes
+ * are split as well if needed. Note that @zbr is overwritten when a split has
+ * to be propagated to the parent. Returns zero in case of success or a
+ * negative error code in case of failure.
+ */
+static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
+ struct ubifs_zbranch *zbr, int n)
+{
+ struct ubifs_znode *zn, *zi, *zp;
+ int i, keep, move, appending = 0;
+ union ubifs_key *key = &zbr->key;
+
+ ubifs_assert(n >= 0 && n <= c->fanout);
+
+ /* Implement naive insert for now */
+again:
+ zp = znode->parent;
+ if (znode->child_cnt < c->fanout) {
+ /* There is a free slot, no split is needed */
+ ubifs_assert(n != c->fanout);
+ dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
+ DBGKEY(key));
+
+ insert_zbranch(znode, zbr, n);
+
+ /* Ensure parent's key is correct */
+ if (n == 0 && zp && znode->iip == 0)
+ correct_parent_keys(c, znode);
+
+ return 0;
+ }
+
+ /*
+ * Unfortunately, @znode does not have more empty slots and we have to
+ * split it.
+ */
+ dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+
+ if (znode->alt)
+ /*
+ * We can no longer be sure of finding this znode by key, so we
+ * record it in the old_idx tree.
+ *
+ * NOTE(review): the result of this call is not checked here.
+ */
+ ins_clr_old_idx_znode(c, znode);
+
+ /* @zn is the new znode which receives the zbranches moved out */
+ zn = kzalloc(c->max_znode_sz, GFP_NOFS);
+ if (!zn)
+ return -ENOMEM;
+ zn->parent = zp;
+ zn->level = znode->level;
+
+ /* Decide where to split */
+ if (znode->level == 0 && n == c->fanout &&
+ key_type(c, key) == UBIFS_DATA_KEY) {
+ union ubifs_key *key1;
+
+ /*
+ * If this is an inode which is being appended - do not split
+ * it because no other zbranches can be inserted between
+ * zbranches of consecutive data nodes anyway.
+ */
+ key1 = &znode->zbranch[n - 1].key;
+ if (key_inum(c, key1) == key_inum(c, key) &&
+ key_type(c, key1) == UBIFS_DATA_KEY &&
+ key_block(c, key1) == key_block(c, key) - 1)
+ appending = 1;
+ }
+
+ if (appending) {
+ /* Leave the old znode full, the new one gets only the new key */
+ keep = c->fanout;
+ move = 0;
+ } else {
+ keep = (c->fanout + 1) / 2;
+ move = c->fanout - keep;
+ }
+
+ /*
+ * Although we don't at present, we could look at the neighbors and see
+ * if we can move some zbranches there.
+ */
+
+ if (n < keep) {
+ /* Insert into existing znode */
+ zi = znode;
+ move += 1;
+ keep -= 1;
+ } else {
+ /* Insert into new znode */
+ zi = zn;
+ n -= keep;
+ /* Re-parent */
+ if (zn->level != 0)
+ zbr->znode->parent = zn;
+ }
+
+ __set_bit(DIRTY_ZNODE, &zn->flags);
+ atomic_long_inc(&c->dirty_zn_cnt);
+
+ zn->child_cnt = move;
+ znode->child_cnt = keep;
+
+ dbg_tnc("moving %d, keeping %d", move, keep);
+
+ /* Move zbranch */
+ for (i = 0; i < move; i++) {
+ zn->zbranch[i] = znode->zbranch[keep + i];
+ /* Re-parent */
+ if (zn->level != 0)
+ if (zn->zbranch[i].znode) {
+ zn->zbranch[i].znode->parent = zn;
+ zn->zbranch[i].znode->iip = i;
+ }
+ }
+
+ /* Insert new key and branch */
+ dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+
+ insert_zbranch(zi, zbr, n);
+
+ /* Insert new znode (produced by splitting) into the parent */
+ if (zp) {
+ i = n;
+ /* Locate insertion point */
+ n = znode->iip + 1;
+ if (appending && n != c->fanout)
+ appending = 0;
+
+ if (i == 0 && zi == znode && znode->iip == 0)
+ correct_parent_keys(c, znode);
+
+ /*
+ * Tail recursion: reuse @zbr to describe the new znode and
+ * repeat the insertion one level up.
+ */
+ zbr->key = zn->zbranch[0].key;
+ zbr->znode = zn;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+ znode = zp;
+
+ goto again;
+ }
+
+ /* We have to split root znode */
+ dbg_tnc("creating new zroot at level %d", znode->level + 1);
+
+ /* From here on @zi is the new root znode */
+ zi = kzalloc(c->max_znode_sz, GFP_NOFS);
+ if (!zi)
+ /*
+ * NOTE(review): @zn allocated above is not freed on this path
+ * and @znode has already been shrunk - confirm this is
+ * acceptable.
+ */
+ return -ENOMEM;
+
+ zi->child_cnt = 2;
+ zi->level = znode->level + 1;
+
+ __set_bit(DIRTY_ZNODE, &zi->flags);
+ atomic_long_inc(&c->dirty_zn_cnt);
+
+ /* Slot 0 takes over the old root's location, slot 1 is the new znode */
+ zi->zbranch[0].key = znode->zbranch[0].key;
+ zi->zbranch[0].znode = znode;
+ zi->zbranch[0].lnum = c->zroot.lnum;
+ zi->zbranch[0].offs = c->zroot.offs;
+ zi->zbranch[0].len = c->zroot.len;
+ zi->zbranch[1].key = zn->zbranch[0].key;
+ zi->zbranch[1].znode = zn;
+
+ c->zroot.lnum = 0;
+ c->zroot.offs = 0;
+ c->zroot.len = 0;
+ c->zroot.znode = zi;
+
+ zn->parent = zi;
+ zn->iip = 1;
+ znode->parent = zi;
+ znode->iip = 0;
+
+ return 0;
+}
+
+/**
+ * ubifs_tnc_add - add a node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function adds a node with key @key to TNC. If the key is not in the
+ * index yet a new zbranch is inserted, otherwise the existing zbranch is
+ * re-pointed to the new location and the space of the old node is accounted
+ * as dirt. Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+		  int offs, int len)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+
+	ret = lookup_level0_dirty(c, key, &leaf, &slot);
+	if (ret == 1) {
+		/* The key exists: the new node obsoletes the old one */
+		struct ubifs_zbranch *old = &leaf->zbranch[slot];
+
+		lnc_free(old);
+		ret = ubifs_add_dirt(c, old->lnum, old->len);
+		old->lnum = lnum;
+		old->offs = offs;
+		old->len = len;
+	} else if (ret == 0) {
+		/* The key is new: insert it right after the closest slot */
+		struct ubifs_zbranch fresh;
+
+		fresh.znode = NULL;
+		fresh.lnum = lnum;
+		fresh.offs = offs;
+		fresh.len = len;
+		key_copy(c, key, &fresh.key);
+		ret = tnc_insert(c, leaf, &fresh, slot + 1);
+	}
+	/* Any other lookup result is passed on to the caller unchanged */
+
+	if (!ret)
+		ret = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+
+	return ret;
+}
+
+/**
+ * ubifs_tnc_replace - replace a node in the TNC only if the old node is found.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @old_lnum: LEB number of old node
+ * @old_offs: old node offset
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function replaces a node with key @key in the TNC only if the old node
+ * is found. This function is called by garbage collection when nodes are
+ * moved. If the old node is not found, the space of the new node is accounted
+ * as dirt instead. Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+ int old_lnum, int old_offs, int lnum, int offs, int len)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
+ old_offs, lnum, offs, len, DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found == 1) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ /*
+ * From here on @found tells whether the zbranch which refers
+ * the old node location was found, not just the key.
+ */
+ found = 0;
+ if (zbr->lnum == old_lnum && zbr->offs == old_offs) {
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ if (err)
+ goto out_unlock;
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ found = 1;
+ } else if (is_hash_key(c, key)) {
+ /*
+ * Hashed keys may collide, so look for the old location
+ * among the other entries with the same key.
+ */
+ found = resolve_collision_directly(c, key, &znode, &n,
+ old_lnum, old_offs);
+ dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d",
+ found, znode, n, old_lnum, old_offs);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found) {
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c,
+ znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+ zbr = &znode->zbranch[n];
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum,
+ zbr->len);
+ if (err)
+ goto out_unlock;
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ }
+ }
+ }
+
+ /* The old node is not in the index, so the new copy is just dirt */
+ if (!found)
+ err = ubifs_add_dirt(c, lnum, len);
+
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_add_nm - add a "hashed" node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ * @nm: node name
+ *
+ * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
+ * may have collisions, like directory entry keys. Returns %0 on success or
+ * negative error code on failure.
+ */
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+ int lnum, int offs, int len, const struct qstr *nm)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
+ DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found == 1) {
+ /* The key exists, now find out whether the name exists too */
+ if (c->replaying)
+ found = fallible_resolve_collision(c, key, &znode, &n,
+ nm, 1);
+ else
+ found = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+
+ if (found == 1) {
+ /* Same key and same name: re-point the existing zbranch */
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ goto out_unlock;
+ }
+ }
+
+ if (!found) {
+ /* No entry with this key and name, insert a new zbranch */
+ struct ubifs_zbranch zbr;
+
+ zbr.znode = NULL;
+ zbr.lnum = lnum;
+ zbr.offs = offs;
+ zbr.len = len;
+ key_copy(c, key, &zbr.key);
+ err = tnc_insert(c, znode, &zbr, n + 1);
+ if (err)
+ goto out_unlock;
+ if (c->replaying) {
+ /*
+ * We did not find it in the index so there may be a
+ * dangling branch still in the index. So we remove it
+ * by passing 'ubifs_tnc_remove_nm()' the same key but
+ * an unmatchable name.
+ */
+ struct qstr noname = { .len = 0, .name = "" };
+
+ /*
+ * 'ubifs_tnc_remove_nm()' takes 'c->tnc_mutex' itself,
+ * so the mutex has to be dropped before calling it.
+ */
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ if (err)
+ return err;
+ return ubifs_tnc_remove_nm(c, key, &noname);
+ }
+ }
+
+out_unlock:
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * tnc_delete - delete a znode from TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to delete from
+ * @n: zbranch slot number to delete
+ *
+ * This function deletes a leaf node from @n-th slot of @znode. If @znode
+ * becomes empty it is removed from its parent, and so are the parents which
+ * become empty in turn. If the root znode ends up with a single child, the
+ * tree is collapsed. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
+{
+ struct ubifs_zbranch *zbr;
+ struct ubifs_znode *zp;
+ int i, err;
+
+ /* Delete without merge for now */
+ ubifs_assert(znode->level == 0);
+ ubifs_assert(n >= 0 && n < c->fanout);
+ dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+
+ zbr = &znode->zbranch[n];
+ lnc_free(zbr);
+
+ /* The space of the node which was referred by this zbranch is dirt now */
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ if (err) {
+ dbg_dump_znode(c, znode);
+ return err;
+ }
+
+ /* We do not "gap" zbranch slots */
+ for (i = n; i < znode->child_cnt - 1; i++)
+ znode->zbranch[i] = znode->zbranch[i + 1];
+ znode->child_cnt -= 1;
+
+ if (znode->child_cnt > 0)
+ return 0;
+
+ /*
+ * This was the last zbranch, we have to delete this znode from the
+ * parent.
+ */
+
+ do {
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ ubifs_assert(ubifs_zn_dirty(znode));
+
+ zp = znode->parent;
+ n = znode->iip;
+
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ err = insert_old_idx_znode(c, znode);
+ if (err)
+ return err;
+
+ /*
+ * A znode with 'cnext' set is only marked obsolete instead of
+ * being freed here.
+ */
+ if (znode->cnext) {
+ __set_bit(OBSOLETE_ZNODE, &znode->flags);
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ } else
+ kfree(znode);
+ znode = zp;
+ } while (znode->child_cnt == 1); /* while removing last child */
+
+ /* Remove slot @n (the 'iip' of the znode deleted last) from znode */
+ znode->child_cnt -= 1;
+ ubifs_assert(znode->level != 0);
+ for (i = n; i < znode->child_cnt; i++) {
+ znode->zbranch[i] = znode->zbranch[i + 1];
+ if (znode->zbranch[i].znode)
+ znode->zbranch[i].znode->iip = i;
+ }
+
+ /*
+ * If this is the root and it has only 1 child then
+ * collapse the tree.
+ */
+ if (!znode->parent) {
+ while (znode->child_cnt == 1 && znode->level != 0) {
+ /* @zp is the old root, @znode becomes the new root */
+ zp = znode;
+ zbr = &znode->zbranch[0];
+ znode = get_znode(c, znode, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode->parent = NULL;
+ znode->iip = 0;
+ if (c->zroot.len) {
+ err = insert_old_idx(c, c->zroot.lnum,
+ c->zroot.offs);
+ if (err)
+ return err;
+ }
+ c->zroot.lnum = zbr->lnum;
+ c->zroot.offs = zbr->offs;
+ c->zroot.len = zbr->len;
+ c->zroot.znode = znode;
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE,
+ &zp->flags));
+ ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags));
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ if (zp->cnext) {
+ __set_bit(OBSOLETE_ZNODE, &zp->flags);
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ } else
+ kfree(zp);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_tnc_remove - remove an index entry of a node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ *
+ * This function removes the index entry for @key if there is one; a key which
+ * is not in the index is not an error. Returns %0 on success or negative
+ * error code on failure.
+ */
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnc("key %s", DBGKEY(key));
+
+	ret = lookup_level0_dirty(c, key, &leaf, &slot);
+	if (ret >= 0) {
+		/* Only an exact match has something to delete */
+		ret = (ret == 1) ? tnc_delete(c, leaf, slot) : 0;
+		if (!ret)
+			ret = dbg_check_tnc(c, 0);
+	}
+
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ * @nm: directory entry name
+ *
+ * This function removes the index entry which has key @key and name @nm, if
+ * there is one; not finding such an entry is not an error. Returns %0 on
+ * success or negative error code on failure.
+ */
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+			const struct qstr *nm)
+{
+	int slot, ret;
+	struct ubifs_znode *leaf;
+
+	mutex_lock(&c->tnc_mutex);
+	dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+
+	ret = lookup_level0_dirty(c, key, &leaf, &slot);
+	if (ret <= 0)
+		/* Either an error or there is no such key at all */
+		goto out_unlock;
+
+	/* Pick the entry with the right name among the colliding ones */
+	if (c->replaying)
+		ret = fallible_resolve_collision(c, key, &leaf, &slot, nm, 0);
+	else
+		ret = resolve_collision(c, key, &leaf, &slot, nm);
+	dbg_tnc("rc returned %d, znode %p, n %d", ret, leaf, slot);
+	if (ret <= 0)
+		/* Either an error or no entry with this name */
+		goto out_unlock;
+
+	/* Ensure the znode is dirtied */
+	if (leaf->cnext || !ubifs_zn_dirty(leaf)) {
+		leaf = dirty_cow_bottom_up(c, leaf);
+		if (IS_ERR(leaf)) {
+			ret = PTR_ERR(leaf);
+			goto out_unlock;
+		}
+	}
+	ret = tnc_delete(c, leaf, slot);
+
+out_unlock:
+	if (!ret)
+		ret = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * key_in_range - determine if a key falls within a range of keys.
+ * @c: UBIFS file-system description object
+ * @key: key to check
+ * @from_key: lowest key in range
+ * @to_key: highest key in range
+ *
+ * Both ends of the range are inclusive. This function returns %1 if the key
+ * is in range and %0 otherwise.
+ */
+static int key_in_range(struct ubifs_info *c, union ubifs_key *key,
+			union ubifs_key *from_key, union ubifs_key *to_key)
+{
+	return keys_cmp(c, key, from_key) >= 0 &&
+	       keys_cmp(c, key, to_key) <= 0;
+}
+
+/**
+ * ubifs_tnc_remove_range - remove index entries in range.
+ * @c: UBIFS file-system description object
+ * @from_key: lowest key to remove
+ * @to_key: highest key to remove
+ *
+ * This function removes index entries starting at @from_key and ending at
+ * @to_key (both inclusive, see 'key_in_range()'). It works one level 0 znode
+ * at a time and repeats the lookup until no key in range is left. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+ union ubifs_key *to_key)
+{
+ int i, n, k, err = 0;
+ struct ubifs_znode *znode;
+ union ubifs_key *key;
+
+ mutex_lock(&c->tnc_mutex);
+ while (1) {
+ /* Find first level 0 znode that contains keys to remove */
+ err = ubifs_lookup_level0(c, from_key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err)
+ key = from_key;
+ else {
+ /* No exact match, the next entry is the first candidate */
+ err = tnc_next(c, &znode, &n);
+ if (err == -ENOENT) {
+ err = 0;
+ goto out_unlock;
+ }
+ if (err < 0)
+ goto out_unlock;
+ key = &znode->zbranch[n].key;
+ if (!key_in_range(c, key, from_key, to_key)) {
+ /* Nothing (more) to remove, we are done */
+ err = 0;
+ goto out_unlock;
+ }
+ }
+
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Remove all keys in range except the first. @k counts how many
+ * zbranches following slot @n are being removed.
+ */
+ for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
+ key = &znode->zbranch[i].key;
+ if (!key_in_range(c, key, from_key, to_key))
+ break;
+ lnc_free(&znode->zbranch[i]);
+ err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
+ znode->zbranch[i].len);
+ if (err) {
+ dbg_dump_znode(c, znode);
+ goto out_unlock;
+ }
+ dbg_tnc("removing %s", DBGKEY(key));
+ }
+ if (k) {
+ /* Close the gap left by the @k removed zbranches */
+ for (i = n + 1 + k; i < znode->child_cnt; i++)
+ znode->zbranch[i - k] = znode->zbranch[i];
+ znode->child_cnt -= k;
+ }
+
+ /* Now delete the first */
+ err = tnc_delete(c, znode, n);
+ if (err)
+ goto out_unlock;
+ }
+
+out_unlock:
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_remove_ino - remove an inode from TNC.
+ * @c: UBIFS file-system description object
+ * @inum: inode number to remove
+ *
+ * This function removes inode @inum and all the extended attributes
+ * associated with the inode from TNC and returns zero in case of success or a
+ * negative error code in case of failure.
+ */
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
+{
+	union ubifs_key key1, key2;
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	struct qstr nm = { .name = NULL };
+	int err;
+
+	/* 'ino_t' is not 'unsigned long' everywhere, so cast it for "%lu" */
+	dbg_tnc("ino %lu", (unsigned long)inum);
+
+	/*
+	 * Walk all extended attribute entries and remove them together with
+	 * corresponding extended attribute inodes.
+	 */
+	lowest_xent_key(c, &key1, inum);
+	while (1) {
+		ino_t xattr_inum;
+
+		xent = ubifs_tnc_next_ent(c, &key1, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			if (err == -ENOENT)
+				break;
+			/* @xent is an error pointer, only @pxent is ours */
+			kfree(pxent);
+			return err;
+		}
+
+		xattr_inum = le64_to_cpu(xent->inum);
+		dbg_tnc("xent '%s', ino %lu", xent->name,
+			(unsigned long)xattr_inum);
+
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+		err = ubifs_tnc_remove_nm(c, &key1, &nm);
+		if (err)
+			goto out_free;
+
+		lowest_ino_key(c, &key1, xattr_inum);
+		highest_ino_key(c, &key2, xattr_inum);
+		err = ubifs_tnc_remove_range(c, &key1, &key2);
+		if (err)
+			goto out_free;
+
+		/*
+		 * @nm.name points into @xent, so @xent has to stay allocated
+		 * until the next entry has been looked up; only the entry of
+		 * the previous iteration can be freed now.
+		 */
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key1);
+	}
+
+	kfree(pxent);
+	lowest_ino_key(c, &key1, inum);
+	highest_ino_key(c, &key2, inum);
+
+	return ubifs_tnc_remove_range(c, &key1, &key2);
+
+out_free:
+	/* Release both the current and the previous entry on failure */
+	kfree(pxent);
+	kfree(xent);
+	return err;
+}
+
+/**
+ * ubifs_tnc_next_ent - walk directory or extended attribute entries.
+ * @c: UBIFS file-system description object
+ * @key: key of last entry
+ * @nm: name of last entry found; @nm itself is always dereferenced and so
+ *      must not be %NULL, but @nm->name is %NULL when the name is not known
+ *
+ * This function finds and reads the next directory or extended attribute entry
+ * after the given key (@key) if there is one. @nm is used to resolve
+ * collisions.
+ *
+ * If the name of the current entry is not known and only the key is known,
+ * @nm->name has to be %NULL. In this case the semantics of this function is a
+ * little bit different and it returns the entry corresponding to this key, not
+ * the next one. If the key was not found, the closest "right" entry is
+ * returned.
+ *
+ * If the first entry has to be found, @key has to contain the lowest possible
+ * key value for this inode and @nm->name has to be %NULL.
+ *
+ * This function returns the found directory or extended attribute entry node
+ * in case of success. The node is allocated with 'kmalloc()' here, so the
+ * caller is responsible for freeing it with 'kfree()'. On failure an
+ * 'ERR_PTR()'-encoded value is returned: %-ENOENT if no entry was found, or
+ * another negative error code.
+ */
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+ union ubifs_key *key,
+ const struct qstr *nm)
+{
+ int n, err, type = key_type(c, key);
+ struct ubifs_znode *znode;
+ struct ubifs_dent_node *dent;
+ struct ubifs_zbranch *zbr;
+ union ubifs_key *dkey;
+
+ dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+ ubifs_assert(is_hash_key(c, key));
+
+ mutex_lock(&c->tnc_mutex);
+ /* Negative means error; zero and non-zero are told apart below */
+ err = ubifs_lookup_level0(c, key, &znode, &n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+
+ if (nm->name) {
+ if (err) {
+ /* Handle collisions */
+ err = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d",
+ err, znode, n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+ }
+
+ /* Now find next entry */
+ err = tnc_next(c, &znode, &n);
+ if (unlikely(err))
+ goto out_unlock;
+ } else {
+ /*
+ * The full name of the entry was not given, in which case the
+ * behavior of this function is a little different and it
+ * returns current entry, not the next one.
+ */
+ if (!err) {
+ /*
+ * However, the given key does not exist in the TNC
+ * tree and @znode/@n variables contain the closest
+ * "preceding" element. Switch to the next one.
+ */
+ err = tnc_next(c, &znode, &n);
+ if (err)
+ goto out_unlock;
+ }
+ }
+
+ /* The buffer for the node is sized from the length kept in the branch */
+ zbr = &znode->zbranch[n];
+ dent = kmalloc(zbr->len, GFP_NOFS);
+ if (unlikely(!dent)) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /*
+ * The above 'tnc_next()' call could lead us to the next inode, check
+ * this.
+ */
+ dkey = &zbr->key;
+ if (key_inum(c, dkey) != key_inum(c, key) ||
+ key_type(c, dkey) != type) {
+ err = -ENOENT;
+ goto out_free;
+ }
+
+ err = tnc_read_node_nm(c, zbr, dent);
+ if (unlikely(err))
+ goto out_free;
+
+ /* Success: ownership of @dent passes to the caller */
+ mutex_unlock(&c->tnc_mutex);
+ return dent;
+
+out_free:
+ kfree(dent);
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return ERR_PTR(err);
+}
+
+/**
+ * tnc_destroy_cnext - free obsolete znodes left on the commit list.
+ * @c: UBIFS file-system description object
+ *
+ * Walks the list which starts at @c->cnext and is linked via the 'cnext'
+ * fields, and frees every znode which has the %OBSOLETE_ZNODE flag set. The
+ * walk stops when the list ends or wraps around to @c->cnext. This is only
+ * expected to have any work to do after a failed commit.
+ */
+static void tnc_destroy_cnext(struct ubifs_info *c)
+{
+	struct ubifs_znode *cur, *next;
+
+	if (!c->cnext)
+		return;
+	ubifs_assert(c->cmt_state == COMMIT_BROKEN);
+
+	cur = c->cnext;
+	for (;;) {
+		/* Fetch the link first, @cur may be freed just below */
+		next = cur->cnext;
+		if (test_bit(OBSOLETE_ZNODE, &cur->flags))
+			kfree(cur);
+		if (!next || next == c->cnext)
+			break;
+		cur = next;
+	}
+}
+
+/**
+ * ubifs_tnc_close - shut the TNC down and release what it holds.
+ * @c: UBIFS file-system description object
+ *
+ * Frees left-over commit znodes, destroys the znode tree hanging off
+ * @c->zroot (adjusting the global clean znode counter by the value the
+ * destroy helper returns), frees the @c->gap_lebs and @c->ilebs arrays and
+ * destroys the old index.
+ */
+void ubifs_tnc_close(struct ubifs_info *c)
+{
+	tnc_destroy_cnext(c);
+
+	if (c->zroot.znode) {
+		long freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+
+		atomic_long_sub(freed, &ubifs_clean_zn_cnt);
+	}
+
+	kfree(c->gap_lebs);
+	kfree(c->ilebs);
+	destroy_old_idx(c);
+}
+
+/**
+ * left_znode - get the znode to the left.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * Finds the znode which sits immediately to the left of @znode on the same
+ * level. Returns a pointer to it, %NULL if there is no such znode, or an
+ * 'ERR_PTR()'-encoded negative error code if a znode could not be obtained.
+ */
+static struct ubifs_znode *left_znode(struct ubifs_info *c,
+				      struct ubifs_znode *znode)
+{
+	const int target_level = znode->level;
+	int idx;
+
+	/* Climb until we stand on a child which is not the leftmost one */
+	do {
+		idx = znode->iip - 1;
+		znode = znode->parent;
+		if (!znode)
+			return NULL;
+	} while (idx < 0);
+
+	/* Step to the left sibling, then follow rightmost branches down */
+	znode = get_znode(c, znode, idx);
+	while (!IS_ERR(znode) && znode->level != target_level)
+		znode = get_znode(c, znode, znode->child_cnt - 1);
+
+	return znode;
+}
+
+/**
+ * right_znode - get the znode to the right.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * Finds the znode which sits immediately to the right of @znode on the same
+ * level. Returns a pointer to it, %NULL if there is no such znode, or an
+ * 'ERR_PTR()'-encoded negative error code if a znode could not be obtained.
+ */
+static struct ubifs_znode *right_znode(struct ubifs_info *c,
+				       struct ubifs_znode *znode)
+{
+	const int target_level = znode->level;
+	int idx;
+
+	/* Climb until we stand on a child which is not the rightmost one */
+	do {
+		idx = znode->iip + 1;
+		znode = znode->parent;
+		if (!znode)
+			return NULL;
+	} while (idx >= znode->child_cnt);
+
+	/* Step to the right sibling, then follow leftmost branches down */
+	znode = get_znode(c, znode, idx);
+	while (!IS_ERR(znode) && znode->level != target_level)
+		znode = get_znode(c, znode, 0);
+
+	return znode;
+}
+
+/**
+ * lookup_znode - find a particular indexing node from TNC.
+ * @c: UBIFS file-system description object
+ * @key: index node key to lookup
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function searches an indexing node by its first key @key and its
+ * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
+ * nodes it traverses to TNC. This function is called for indexing nodes which
+ * were found on the media by scanning, for example when garbage-collecting or
+ * when doing in-the-gaps commit. This means that the indexing node which is
+ * looked for does not have to have exactly the same leftmost key @key, because
+ * the leftmost key may have been changed, in which case TNC will contain a
+ * dirty znode which still refers to the same @lnum:@offs. This function is
+ * clever enough to recognize such indexing nodes.
+ *
+ * Note, if a znode was deleted or changed too much, then this function will
+ * not find it. For situations like this UBIFS has the old index RB-tree
+ * (indexed by @lnum:@offs).
+ *
+ * This function returns a pointer to the znode found or %NULL if it is not
+ * found. A negative error code is returned on failure (encoded with
+ * 'ERR_PTR()', so callers have to check the result with 'IS_ERR()').
+ */
+static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
+ union ubifs_key *key, int level,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode, *zn;
+ int n, nn;
+
+ /*
+ * The arguments have probably been read off flash, so don't assume
+ * they are valid.
+ */
+ if (level < 0)
+ return ERR_PTR(-EINVAL);
+
+ /* Get the root znode */
+ znode = c->zroot.znode;
+ if (!znode) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ /* Check if it is the one we are looking for */
+ if (c->zroot.lnum == lnum && c->zroot.offs == offs)
+ return znode;
+ /* Descend to the parent level i.e. (level + 1) */
+ if (level >= znode->level)
+ return NULL;
+ while (1) {
+ ubifs_search_zbranch(c, znode, key, &n);
+ if (n < 0) {
+ /*
+ * We reached a znode where the leftmost key is greater
+ * than the key we are searching for. This is the same
+ * situation as the one described in a huge comment at
+ * the end of the 'ubifs_lookup_level0()' function. And
+ * for exactly the same reasons we have to try to look
+ * left before giving up.
+ */
+ znode = left_znode(c, znode);
+ if (!znode)
+ return NULL;
+ if (IS_ERR(znode))
+ return znode;
+ ubifs_search_zbranch(c, znode, key, &n);
+ ubifs_assert(n >= 0);
+ }
+ if (znode->level == level + 1)
+ break;
+ znode = get_znode(c, znode, n);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ /* Check if the child is the one we are looking for */
+ if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* If the key is unique, there is nowhere else to look */
+ if (!is_hash_key(c, key))
+ return NULL;
+ /*
+ * The key is not unique and so may be also in the znodes to either
+ * side.
+ */
+ /* Remember the starting branch so the rightward scan can restart here */
+ zn = znode;
+ nn = n;
+ /* Look left */
+ while (1) {
+ /* Move one branch to the left */
+ if (n)
+ n -= 1;
+ else {
+ znode = left_znode(c, znode);
+ if (!znode)
+ break;
+ if (IS_ERR(znode))
+ return znode;
+ n = znode->child_cnt - 1;
+ }
+ /* Check it */
+ if (znode->zbranch[n].lnum == lnum &&
+ znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* Stop if the key is less than the one we are looking for */
+ if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
+ break;
+ }
+ /* Back to the middle */
+ znode = zn;
+ n = nn;
+ /* Look right */
+ while (1) {
+ /* Move one branch to the right */
+ if (++n >= znode->child_cnt) {
+ znode = right_znode(c, znode);
+ if (!znode)
+ break;
+ if (IS_ERR(znode))
+ return znode;
+ n = 0;
+ }
+ /* Check it */
+ if (znode->zbranch[n].lnum == lnum &&
+ znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* Stop if the key is greater than the one we are looking for */
+ if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
+ break;
+ }
+ /* Neither side refers to @lnum:@offs */
+ return NULL;
+}
+
+/**
+ * is_idx_node_in_tnc - determine if an index node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * Return values:
+ *   %0 - the index node is not referred to in the TNC;
+ *   %1 - it is referred to and the corresponding znode is dirty;
+ *   %2 - it is referred to and the corresponding znode is clean;
+ *   a negative error code in case of failure.
+ *
+ * Note, the @key argument has to be the key of the first child. Also note,
+ * this function relies on the fact that 0:0 is never a valid LEB number and
+ * offset for a main-area node.
+ */
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs)
+{
+	struct ubifs_znode *zn = lookup_znode(c, key, level, lnum, offs);
+
+	if (IS_ERR(zn))
+		return PTR_ERR(zn);
+	if (!zn)
+		return 0;
+	if (ubifs_zn_dirty(zn))
+		return 1;
+	return 2;
+}
+
+/**
+ * is_leaf_node_in_tnc - determine if a non-indexing node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @lnum: node LEB number
+ * @offs: node offset
+ *
+ * This function returns %1 if the node is referred to in the TNC, %0 if it is
+ * not, and a negative error code in case of failure.
+ *
+ * Note, this function relies on the fact that 0:0 is never a valid LEB number
+ * and offset for a main-area node.
+ */
+static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
+			       int lnum, int offs)
+{
+	const int unique = !is_hash_key(c, key);
+	struct ubifs_znode *znode, *start_znode;
+	struct ubifs_zbranch *br;
+	int n, start_n, ret;
+
+	ret = ubifs_lookup_level0(c, key, &znode, &n);
+	if (ret <= 0)
+		return ret; /* Error code, or 0 when the key was not found */
+
+	br = &znode->zbranch[n];
+	if (br->lnum == lnum && br->offs == offs)
+		return 1; /* Found it */
+	if (unique)
+		return 0;
+
+	/*
+	 * Hashed keys may collide, so entries with the same key on either
+	 * side of the one found have to be examined too.
+	 */
+	start_znode = znode;
+	start_n = n;
+
+	/* Look left */
+	for (;;) {
+		ret = tnc_prev(c, &znode, &n);
+		if (ret == -ENOENT)
+			break;
+		if (ret)
+			return ret;
+		br = &znode->zbranch[n];
+		if (keys_cmp(c, key, &br->key))
+			break;
+		if (br->lnum == lnum && br->offs == offs)
+			return 1; /* Found it */
+	}
+
+	/* Look right, starting again from the entry found by the lookup */
+	znode = start_znode;
+	n = start_n;
+	for (;;) {
+		ret = tnc_next(c, &znode, &n);
+		if (ret == -ENOENT)
+			return 0;
+		if (ret)
+			return ret;
+		br = &znode->zbranch[n];
+		if (keys_cmp(c, key, &br->key))
+			break;
+		if (br->lnum == lnum && br->offs == offs)
+			return 1; /* Found it */
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_tnc_has_node - determine whether a node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @level: index node level (if it is an index node)
+ * @lnum: node LEB number
+ * @offs: node offset
+ * @is_idx: non-zero if the node is an index node
+ *
+ * This function returns %1 if the node is in the TNC, %0 if it is not, and a
+ * negative error code in case of failure. For index nodes, @key has to be the
+ * key of the first child. An index node is considered to be in the TNC only if
+ * the corresponding znode is clean or has not been loaded.
+ *
+ * The TNC mutex is taken for the duration of the check.
+ */
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs, int is_idx)
+{
+	int ret;
+
+	mutex_lock(&c->tnc_mutex);
+
+	if (!is_idx) {
+		ret = is_leaf_node_in_tnc(c, key, lnum, offs);
+	} else {
+		ret = is_idx_node_in_tnc(c, key, level, lnum, offs);
+		if (ret >= 0) {
+			switch (ret) {
+			case 1:
+				/* The index node was found but it was dirty */
+				ret = 0;
+				break;
+			case 2:
+				/* The index node was found and it was clean */
+				ret = 1;
+				break;
+			default:
+				BUG_ON(ret != 0);
+				break;
+			}
+		}
+	}
+
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
+
+/**
+ * ubifs_dirty_idx_node - dirty an index node.
+ * @c: UBIFS file-system description object
+ * @key: index node key
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function looks the index node up under the TNC mutex and, if it is
+ * found, dirties it so that it can be garbage collected. Nothing is done if
+ * the node is not found. The @key argument has to be the key of the first
+ * child. This function relies on the fact that 0:0 is never a valid LEB number
+ * and offset for a main-area node. Returns %0 on success and a negative error
+ * code on failure.
+ */
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+			 int lnum, int offs)
+{
+	struct ubifs_znode *zn;
+	int ret = 0;
+
+	mutex_lock(&c->tnc_mutex);
+
+	zn = lookup_znode(c, key, level, lnum, offs);
+	if (zn && !IS_ERR(zn))
+		zn = dirty_cow_bottom_up(c, zn);
+	/* Covers a failed lookup as well as a failed dirtying */
+	if (IS_ERR(zn))
+		ret = PTR_ERR(zn);
+
+	mutex_unlock(&c->tnc_mutex);
+	return ret;
+}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
new file mode 100644
index 00000000000..8117e65ba2e
--- /dev/null
+++ b/fs/ubifs/tnc_commit.c
@@ -0,0 +1,1103 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/* This file implements TNC functions for committing */
+
+#include "ubifs.h"
+
+/**
+ * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
+ * @c: UBIFS file-system description object
+ * @idx: buffer in which to place new index node
+ * @znode: znode from which to make new index node
+ * @lnum: LEB number where new index node will be written
+ * @offs: offset where new index node will be written
+ * @len: length of new index node
+ *
+ * This function builds the on-flash index node in @idx from @znode, points the
+ * parent branch (or @c->zroot if there is no parent) at @lnum:@offs, and
+ * clears the dirty and copy-on-write flags of @znode. It returns whatever
+ * 'insert_old_idx_znode()' returned; note that the rest of the bookkeeping is
+ * carried out even if that call failed.
+ */
+static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
+ struct ubifs_znode *znode, int lnum, int offs, int len)
+{
+ struct ubifs_znode *zp;
+ int i, err;
+
+ /* Make index node */
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(znode->child_cnt);
+ idx->level = cpu_to_le16(znode->level);
+ for (i = 0; i < znode->child_cnt; i++) {
+ struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ key_write_idx(c, &zbr->key, &br->key);
+ br->lnum = cpu_to_le32(zbr->lnum);
+ br->offs = cpu_to_le32(zbr->offs);
+ br->len = cpu_to_le32(zbr->len);
+ /* A zero LEB number or length is only reported, not treated as fatal */
+ if (!zbr->lnum || !zbr->len) {
+ ubifs_err("bad ref in znode");
+ dbg_dump_znode(c, znode);
+ if (zbr->znode)
+ dbg_dump_znode(c, zbr->znode);
+ }
+ }
+ ubifs_prepare_node(c, idx, len, 0);
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ znode->lnum = lnum;
+ znode->offs = offs;
+ znode->len = len;
+#endif
+
+ /*
+ * NOTE(review): presumably this records the previous on-flash position
+ * of the znode in the old index - confirm against the helper. The
+ * result is only returned at the very end of this function.
+ */
+ err = insert_old_idx_znode(c, znode);
+
+ /* Update the parent */
+ zp = znode->parent;
+ if (zp) {
+ struct ubifs_zbranch *zbr;
+
+ zbr = &zp->zbranch[znode->iip];
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ } else {
+ c->zroot.lnum = lnum;
+ c->zroot.offs = offs;
+ c->zroot.len = len;
+ }
+ c->calc_idx_sz += ALIGN(len, 8);
+
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ ubifs_assert(ubifs_zn_dirty(znode));
+ ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+
+ __clear_bit(DIRTY_ZNODE, &znode->flags);
+ __clear_bit(COW_ZNODE, &znode->flags);
+
+ return err;
+}
+
+/**
+ * fill_gap - make index nodes in gaps in dirty index LEBs.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number that gap appears in
+ * @gap_start: offset of start of gap
+ * @gap_end: offset of end of gap
+ * @dirt: adds dirty space to this
+ *
+ * Index nodes are made for znodes taken from @c->enext, one after another,
+ * and placed into @c->ileb_buf for as long as the next one fits into what is
+ * left of the gap. The remainder is padded and accounted for in @dirt.
+ *
+ * This function returns the number of index nodes written into the gap, or a
+ * negative error code if making an index node failed.
+ */
+static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
+ int *dirt)
+{
+ int len, gap_remains, gap_pos, written, pad_len;
+
+ ubifs_assert((gap_start & 7) == 0);
+ ubifs_assert((gap_end & 7) == 0);
+ ubifs_assert(gap_end >= gap_start);
+
+ gap_remains = gap_end - gap_start;
+ if (!gap_remains)
+ return 0;
+ gap_pos = gap_start;
+ written = 0;
+ while (c->enext) {
+ len = ubifs_idx_node_sz(c, c->enext->child_cnt);
+ /*
+ * NOTE(review): the comparison is strict, so a node which would
+ * fill the rest of the gap exactly is not placed - confirm this is
+ * intended.
+ */
+ if (len < gap_remains) {
+ struct ubifs_znode *znode = c->enext;
+ const int alen = ALIGN(len, 8);
+ int err;
+
+ ubifs_assert(alen <= gap_remains);
+ err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
+ lnum, gap_pos, len);
+ if (err)
+ return err;
+ gap_remains -= alen;
+ gap_pos += alen;
+ /* Advance; wrapping round to @c->cnext means the list is done */
+ c->enext = znode->cnext;
+ if (c->enext == c->cnext)
+ c->enext = NULL;
+ written += 1;
+ } else
+ break;
+ }
+ if (gap_end == c->leb_size) {
+ c->ileb_len = ALIGN(gap_pos, c->min_io_size);
+ /* Pad to end of min_io_size */
+ pad_len = c->ileb_len - gap_pos;
+ } else
+ /* Pad to end of gap */
+ pad_len = gap_remains;
+ dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
+ lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
+ ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
+ *dirt += pad_len;
+ return written;
+}
+
+/**
+ * find_old_idx - find an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Searches the @c->old_idx RB-tree, which is ordered by LEB number first and
+ * by offset second. Returns %1 if found and %0 otherwise.
+ */
+static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+	struct rb_node *node = c->old_idx.rb_node;
+
+	while (node) {
+		struct ubifs_old_idx *old;
+		int go_left;
+
+		old = rb_entry(node, struct ubifs_old_idx, rb);
+		if (lnum != old->lnum)
+			go_left = lnum < old->lnum;
+		else if (offs != old->offs)
+			go_left = offs < old->offs;
+		else
+			return 1;
+
+		node = go_left ? node->rb_left : node->rb_right;
+	}
+
+	return 0;
+}
+
+/**
+ * is_idx_node_in_use - determine if an index node can be overwritten.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * The TNC is consulted first. If it refers to the index node, %1 is returned
+ * when the corresponding znode is dirty and %2 when it is clean. If the TNC
+ * does not refer to it, the old index is searched: %1 is returned if the node
+ * is found there and %0 (obsolete) otherwise. A negative error code is
+ * returned on failure.
+ */
+static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
+			      int level, int lnum, int offs)
+{
+	int in_tnc = is_idx_node_in_tnc(c, key, level, lnum, offs);
+
+	/* An error code, or a definite answer (1 or 2) from the TNC */
+	if (in_tnc != 0)
+		return in_tnc;
+
+	return find_old_idx(c, lnum, offs) ? 1 : 0;
+}
+
+/**
+ * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
+ * @c: UBIFS file-system description object
+ * @p: return LEB number here
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ * This function merely puts the next znode into the next gap, making no attempt
+ * to try to maximise the number of znodes that fit.
+ * This function returns the number of index nodes written into the gaps, or a
+ * negative error code on failure. Note that @p is only written once a dirty
+ * index LEB has been found; it is left untouched if that search fails.
+ */
+static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
+
+ tot_written = 0;
+ /* Get an index LEB with lots of obsolete index nodes */
+ lnum = ubifs_find_dirty_idx_leb(c);
+ if (lnum < 0)
+ /*
+ * There also may be dirt in the index head that could be
+ * filled, however we do not check there at present.
+ */
+ return lnum; /* Error code */
+ *p = lnum;
+ dbg_gc("LEB %d", lnum);
+ /*
+ * Scan the index LEB. We use the generic scan for this even though
+ * it is more comprehensive and less efficient than is needed for this
+ * purpose.
+ */
+ sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+ c->ileb_len = 0;
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ gap_start = 0;
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ struct ubifs_idx_node *idx;
+ int in_use, level;
+
+ ubifs_assert(snod->type == UBIFS_IDX_NODE);
+ idx = snod->node;
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ level = le16_to_cpu(idx->level);
+ /* Determine if the index node is in use (not obsolete) */
+ in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
+ snod->offs);
+ if (in_use < 0) {
+ ubifs_scan_destroy(sleb);
+ return in_use; /* Error code */
+ }
+ if (in_use) {
+ /* 1 means dirty in the TNC or found only in the old index */
+ if (in_use == 1)
+ dirt += ALIGN(snod->len, 8);
+ /*
+ * The obsolete index nodes form gaps that can be
+ * overwritten. This gap has ended because we have
+ * found an index node that is still in use
+ * i.e. not obsolete
+ */
+ gap_end = snod->offs;
+ /* Try to fill gap */
+ written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+ if (written < 0) {
+ ubifs_scan_destroy(sleb);
+ return written; /* Error code */
+ }
+ tot_written += written;
+ /* The next gap can start only after this in-use node */
+ gap_start = ALIGN(snod->offs + snod->len, 8);
+ }
+ }
+ ubifs_scan_destroy(sleb);
+ /* The last gap runs from the last in-use node to the end of the LEB */
+ c->ileb_len = c->leb_size;
+ gap_end = c->leb_size;
+ /* Try to fill gap */
+ written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+ if (written < 0)
+ return written; /* Error code */
+ tot_written += written;
+ if (tot_written == 0) {
+ struct ubifs_lprops lp;
+
+ dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+ err = ubifs_read_one_lp(c, lnum, &lp);
+ if (err)
+ return err;
+ if (lp.free == c->leb_size) {
+ /*
+ * We must have snatched this LEB from the idx_gc list
+ * so we need to correct the free and dirty space.
+ */
+ err = ubifs_change_one_lp(c, lnum,
+ c->leb_size - c->ileb_len,
+ dirt, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ /* Nothing was placed, so the LEB contents are not rewritten */
+ return 0;
+ }
+ err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
+ 0, 0, 0);
+ if (err)
+ return err;
+ err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
+ UBI_SHORTTERM);
+ if (err)
+ return err;
+ dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+ return tot_written;
+}
+
+/**
+ * get_leb_cnt - calculate the number of empty LEBs needed to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns the number of empty LEBs needed to commit @cnt znodes
+ * to the current index head. Every index node is assumed to have the maximum
+ * size, so the number is not exact and may be more than needed.
+ */
+static int get_leb_cnt(struct ubifs_info *c, int cnt)
+{
+	/* How many maximum-sized index nodes still fit at the index head */
+	const int head_room = (c->leb_size - c->ihead_offs) /
+			      c->max_idx_node_sz;
+	/* How many maximum-sized index nodes fit into one empty LEB */
+	const int per_leb = c->leb_size / c->max_idx_node_sz;
+	int remaining = cnt - head_room;
+
+	if (remaining < 0)
+		remaining = 0;
+
+	return DIV_ROUND_UP(remaining, per_leb);
+}
+
+/**
+ * layout_in_gaps - in-the-gaps method of committing TNC.
+ * @c: UBIFS file-system description object
+ * @cnt: number of dirty znodes to commit.
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit. The numbers of the LEBs which were used are stored in
+ * the @c->gap_lebs array, which is allocated here and terminated with %-1.
+ *
+ * Running out of dirty index LEBs (%-ENOSPC) is not treated as a failure: the
+ * loop just stops and the commit is attempted anyway. On any other error the
+ * array is freed and @c->gap_lebs is reset to %NULL.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_in_gaps(struct ubifs_info *c, int cnt)
+{
+	int err, leb_needed_cnt, written, *p;
+
+	dbg_gc("%d znodes to write", cnt);
+
+	/* One slot per index LEB plus one for the terminating -1 */
+	c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS);
+	if (!c->gap_lebs)
+		return -ENOMEM;
+
+	p = c->gap_lebs;
+	do {
+		/*
+		 * @p is an 'int *', so the pointer arithmetic is already
+		 * scaled by the element size and the limit must be given in
+		 * elements, not in bytes.
+		 */
+		ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs);
+		written = layout_leb_in_gaps(c, p);
+		if (written < 0) {
+			err = written;
+			if (err == -ENOSPC) {
+				if (!dbg_force_in_the_gaps_enabled) {
+					/*
+					 * Do not print scary warnings if the
+					 * debugging option which forces
+					 * in-the-gaps is enabled.
+					 */
+					ubifs_err("out of space");
+					spin_lock(&c->space_lock);
+					dbg_dump_budg(c);
+					spin_unlock(&c->space_lock);
+					dbg_dump_lprops(c);
+				}
+				/* Try to commit anyway */
+				err = 0;
+				break;
+			}
+			kfree(c->gap_lebs);
+			c->gap_lebs = NULL;
+			return err;
+		}
+		p++;
+		cnt -= written;
+		leb_needed_cnt = get_leb_cnt(c, cnt);
+		dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
+		       leb_needed_cnt, c->ileb_cnt);
+	} while (leb_needed_cnt > c->ileb_cnt);
+
+	/* Terminate the list of LEB numbers */
+	*p = -1;
+	return 0;
+}
+
+/**
+ * layout_in_empty_space - layout index nodes in empty space.
+ * @c: UBIFS file-system description object
+ *
+ * This function lays out new index nodes for dirty znodes using empty LEBs.
+ * It starts at the index head (@c->ihead_lnum:@c->ihead_offs) and, when a
+ * LEB has no room for the next index node, continues in the next LEB taken
+ * from the @c->ilebs array. Nothing is written to the media here: only the
+ * positions are calculated and stored in the parent branches (or in
+ * @c->zroot), and the LEB properties are updated.
+ *
+ * This function returns %0 on success and a negative error code on failure
+ * (%-ENOSPC if the @c->ilebs array is exhausted).
+ */
+static int layout_in_empty_space(struct ubifs_info *c)
+{
+ struct ubifs_znode *znode, *cnext, *zp;
+ int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
+ int wlen, blen, err;
+
+ /* Continue with the first znode which has not been laid out yet */
+ cnext = c->enext;
+ if (!cnext)
+ return 0;
+
+ lnum = c->ihead_lnum;
+ buf_offs = c->ihead_offs;
+
+ /* Size of an index node with @c->fanout children, in whole I/O units */
+ buf_len = ubifs_idx_node_sz(c, c->fanout);
+ buf_len = ALIGN(buf_len, c->min_io_size);
+ used = 0;
+ avail = buf_len;
+
+ /* Ensure there is enough room for first write */
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+ if (buf_offs + next_len > c->leb_size)
+ lnum = -1;
+
+ while (1) {
+ znode = cnext;
+
+ len = ubifs_idx_node_sz(c, znode->child_cnt);
+
+ /* Determine the index node position */
+ if (lnum == -1) {
+ if (c->ileb_nxt >= c->ileb_cnt) {
+ ubifs_err("out of space");
+ return -ENOSPC;
+ }
+ lnum = c->ilebs[c->ileb_nxt++];
+ buf_offs = 0;
+ used = 0;
+ avail = buf_len;
+ }
+
+ offs = buf_offs + used;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ znode->lnum = lnum;
+ znode->offs = offs;
+ znode->len = len;
+#endif
+
+ /* Update the parent */
+ zp = znode->parent;
+ if (zp) {
+ struct ubifs_zbranch *zbr;
+ int i;
+
+ i = znode->iip;
+ zbr = &zp->zbranch[i];
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ } else {
+ c->zroot.lnum = lnum;
+ c->zroot.offs = offs;
+ c->zroot.len = len;
+ }
+ c->calc_idx_sz += ALIGN(len, 8);
+
+ /*
+ * Once lprops is updated, we can decrease the dirty znode count
+ * but it is easier to just do it here.
+ */
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ /*
+ * Calculate the next index node length to see if there is
+ * enough room for it
+ */
+ cnext = znode->cnext;
+ /* Wrapping round to @c->cnext means this was the last znode */
+ if (cnext == c->cnext)
+ next_len = 0;
+ else
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+ /*
+ * With a minimal I/O unit size of 1 there is nothing to align to,
+ * so the position simply advances by the 8-byte aligned length.
+ */
+ if (c->min_io_size == 1) {
+ buf_offs += ALIGN(len, 8);
+ if (next_len) {
+ if (buf_offs + next_len <= c->leb_size)
+ continue;
+ /* The next node does not fit, finish with this LEB */
+ err = ubifs_update_one_lp(c, lnum, 0,
+ c->leb_size - buf_offs, 0, 0);
+ if (err)
+ return err;
+ lnum = -1;
+ continue;
+ }
+ /* That was the last znode */
+ err = ubifs_update_one_lp(c, lnum,
+ c->leb_size - buf_offs, 0, 0, 0);
+ if (err)
+ return err;
+ break;
+ }
+
+ /* Update buffer positions */
+ wlen = used + len;
+ used += ALIGN(len, 8);
+ avail -= ALIGN(len, 8);
+
+ /* Keep accumulating while there is room in the buffer and the LEB */
+ if (next_len != 0 &&
+ buf_offs + used + next_len <= c->leb_size &&
+ avail > 0)
+ continue;
+
+ if (avail <= 0 && next_len &&
+ buf_offs + used + next_len <= c->leb_size)
+ blen = buf_len;
+ else
+ blen = ALIGN(wlen, c->min_io_size);
+
+ /* The buffer is full or there are no more znodes to do */
+ buf_offs += blen;
+ if (next_len) {
+ if (buf_offs + next_len > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ c->leb_size - buf_offs, blen - used,
+ 0, 0);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ /* Whatever goes beyond @blen is carried over to the next round */
+ used -= blen;
+ if (used < 0)
+ used = 0;
+ avail = buf_len - used;
+ continue;
+ }
+ err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
+ blen - used, 0, 0);
+ if (err)
+ return err;
+ break;
+ }
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ c->new_ihead_lnum = lnum;
+ c->new_ihead_offs = buf_offs;
+#endif
+
+ return 0;
+}
+
+/**
+ * layout_commit - determine positions of index nodes to commit.
+ * @c: UBIFS file-system description object
+ * @no_space: indicates that insufficient empty LEBs were allocated
+ * @cnt: number of znodes to commit
+ *
+ * Calculate and update the positions of index nodes to commit. If there were
+ * an insufficient number of empty LEBs allocated, then index nodes are first
+ * placed into the gaps created by obsolete index nodes in non-empty index
+ * LEBs. For this purpose, an obsolete index node is one that was not in the
+ * index as at the end of the last commit. To write "in-the-gaps" requires that
+ * those index LEBs are updated atomically in-place. Whatever is left is then
+ * laid out in empty space.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_commit(struct ubifs_info *c, int no_space, int cnt)
+{
+	if (no_space) {
+		int err = layout_in_gaps(c, cnt);
+
+		if (err)
+			return err;
+	}
+
+	return layout_in_empty_space(c);
+}
+
+/**
+ * find_first_dirty - find first dirty znode.
+ * @znode: znode to begin searching from
+ *
+ * Descends from @znode, at every level moving to the first child which is
+ * loaded and dirty, until level %0 is reached or no such child exists. The
+ * znode where the descent stopped is returned if it is dirty, otherwise (or
+ * if @znode is %NULL) %NULL is returned.
+ */
+static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode)
+{
+	if (!znode)
+		return NULL;
+
+	while (znode->level != 0) {
+		struct ubifs_znode *dirty_child = NULL;
+		int i;
+
+		for (i = 0; i < znode->child_cnt; i++) {
+			struct ubifs_znode *child = znode->zbranch[i].znode;
+
+			if (child && ubifs_zn_dirty(child)) {
+				dirty_child = child;
+				break;
+			}
+		}
+		if (!dirty_child)
+			break;
+		znode = dirty_child;
+	}
+
+	return ubifs_zn_dirty(znode) ? znode : NULL;
+}
+
+/**
+ * find_next_dirty - find next dirty znode.
+ * @znode: znode to begin searching from
+ *
+ * Looks at the siblings to the right of @znode. If one of them is loaded and
+ * dirty, the result of 'find_first_dirty()' for it is returned. Otherwise the
+ * parent is returned, or %NULL if @znode has no parent.
+ */
+static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *parent = znode->parent;
+	int idx;
+
+	if (!parent)
+		return NULL;
+
+	for (idx = znode->iip + 1; idx < parent->child_cnt; idx++) {
+		struct ubifs_znode *sibling = parent->zbranch[idx].znode;
+
+		if (sibling && ubifs_zn_dirty(sibling))
+			return find_first_dirty(sibling);
+	}
+
+	return parent;
+}
+
+/**
+ * get_znodes_to_commit - create list of dirty znodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * Links the dirty znodes via their 'cnext' fields into a circular list which
+ * starts at @c->cnext (@c->enext is set to the same znode), marking each of
+ * them with the %COW_ZNODE flag on the way.
+ *
+ * This function returns the number of znodes to commit.
+ */
+static int get_znodes_to_commit(struct ubifs_info *c)
+{
+	struct ubifs_znode *cur, *next;
+	int cnt;
+
+	cur = find_first_dirty(c->zroot.znode);
+	c->cnext = cur;
+	c->enext = cur;
+	if (!cur) {
+		dbg_cmt("no znodes to commit");
+		return 0;
+	}
+
+	for (cnt = 1; ; cnt++) {
+		ubifs_assert(!test_bit(COW_ZNODE, &cur->flags));
+		__set_bit(COW_ZNODE, &cur->flags);
+		cur->alt = 0;
+
+		next = find_next_dirty(cur);
+		if (!next)
+			break;
+		cur->cnext = next;
+		cur = next;
+	}
+	/* The last znode points back to the first one */
+	cur->cnext = c->cnext;
+
+	dbg_cmt("committing %d znodes", cnt);
+	ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt));
+	return cnt;
+}
+
+/**
+ * alloc_idx_lebs - allocate empty LEBs to be used to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * The LEB numbers obtained are recorded in the 'c->ilebs' array and counted
+ * in 'c->ileb_cnt'. If a LEB cannot be found, the negative code returned by
+ * 'ubifs_find_free_leb_for_idx()' is passed to the caller as is, and the LEBs
+ * found so far stay recorded in 'c->ilebs'.
+ *
+ * This function returns %-ENOSPC if it cannot allocate a sufficient number of
+ * empty LEBs. %0 is returned on success, otherwise a negative error code
+ * is returned.
+ */
+static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
+{
+	int n, leb_cnt;
+
+	c->ileb_cnt = 0;
+	c->ileb_nxt = 0;
+
+	leb_cnt = get_leb_cnt(c, cnt);
+	dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt);
+	if (leb_cnt == 0)
+		return 0;
+
+	c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS);
+	if (c->ilebs == NULL)
+		return -ENOMEM;
+
+	for (n = 0; n < leb_cnt; n++) {
+		int lnum = ubifs_find_free_leb_for_idx(c);
+
+		if (lnum < 0)
+			return lnum;
+
+		c->ilebs[c->ileb_cnt] = lnum;
+		c->ileb_cnt += 1;
+		dbg_cmt("LEB %d", lnum);
+	}
+
+	/* Debugging may request that the in-the-gaps method is exercised */
+	return dbg_force_in_the_gaps() ? -ENOSPC : 0;
+}
+
+/**
+ * free_unused_idx_lebs - free unused LEBs that were allocated for the commit.
+ * @c: UBIFS file-system description object
+ *
+ * It is possible that we allocate more empty LEBs for the commit than we need.
+ * This function frees the surplus, i.e. the entries of 'c->ilebs' from
+ * 'c->ileb_nxt' up to 'c->ileb_cnt'. All of them are processed even if some
+ * fail; the first error encountered is the one reported.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_unused_idx_lebs(struct ubifs_info *c)
+{
+	int idx, ret, first_err = 0;
+
+	for (idx = c->ileb_nxt; idx < c->ileb_cnt; idx++) {
+		int lnum = c->ilebs[idx];
+
+		dbg_cmt("LEB %d", lnum);
+		ret = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+					  LPROPS_INDEX | LPROPS_TAKEN, 0);
+		/* Remember only the first failure, but keep going */
+		if (first_err == 0)
+			first_err = ret;
+	}
+
+	return first_err;
+}
+
+/**
+ * free_idx_lebs - free unused LEBs after commit end.
+ * @c: UBIFS file-system description object
+ *
+ * Besides releasing the unused LEBs, this function frees the 'c->ilebs' array
+ * and resets the pointer to %NULL, whether or not releasing succeeded.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_idx_lebs(struct ubifs_info *c)
+{
+	int ret = free_unused_idx_lebs(c);
+
+	kfree(c->ilebs);
+	c->ilebs = NULL;
+
+	return ret;
+}
+
+/**
+ * ubifs_tnc_start_commit - start TNC commit.
+ * @c: UBIFS file-system description object
+ * @zroot: new index root position is returned here
+ *
+ * This function prepares the list of indexing nodes to commit and lays out
+ * their positions on flash. If there is not enough free space it uses the
+ * in-gap commit method. The whole function runs under 'c->tnc_mutex'.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+	int err, cnt;
+
+	mutex_lock(&c->tnc_mutex);
+
+	err = dbg_check_tnc(c, 1);
+	if (err)
+		goto out;
+
+	cnt = get_znodes_to_commit(c);
+	if (cnt) {
+		int no_space;
+
+		/*
+		 * Running out of empty LEBs is not fatal here - it only
+		 * selects the in-the-gaps layout. Any other error is.
+		 */
+		err = alloc_idx_lebs(c, cnt);
+		if (err && err != -ENOSPC)
+			goto out_free;
+		no_space = (err == -ENOSPC);
+
+		err = layout_commit(c, no_space, cnt);
+		if (err)
+			goto out_free;
+		ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+
+		err = free_unused_idx_lebs(c);
+		if (err)
+			goto out;
+	}
+
+	destroy_old_idx(c);
+	memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch));
+
+	err = ubifs_save_dirty_idx_lnums(c);
+	if (err)
+		goto out;
+
+	/*
+	 * The commit is not finished yet, but the size of the committed index
+	 * ('c->old_idx_sz') is updated and the index growth budget is zeroed
+	 * already now. This is fine because all the space needed to commit
+	 * the index has been reserved, so it is safe for the budgeting
+	 * subsystem to assume the index is already committed, even though it
+	 * is not.
+	 */
+	spin_lock(&c->space_lock);
+	c->old_idx_sz = c->calc_idx_sz;
+	c->budg_uncommitted_idx = 0;
+	spin_unlock(&c->space_lock);
+
+	mutex_unlock(&c->tnc_mutex);
+
+	dbg_cmt("number of index LEBs %d", c->lst.idx_lebs);
+	dbg_cmt("size of index %llu", c->calc_idx_sz);
+	return 0;
+
+out_free:
+	free_idx_lebs(c);
+out:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * write_index - write index nodes.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the index nodes whose positions were laid out in the
+ * layout_in_empty_space function.
+ *
+ * The znodes are taken from the circular 'cnext' list, beginning at
+ * 'c->enext' and ending when the next znode would be 'c->cnext'. Each znode
+ * is turned into an on-flash index node in the commit buffer 'c->cbuf' and
+ * written out with 'ubifs_leb_write()'. Writing begins at the index head
+ * ('c->ihead_lnum', 'c->ihead_offs'); once the current LEB cannot take the
+ * next node, subsequent LEBs are taken from 'c->ilebs'. On success the index
+ * head is updated to the position following the last write.
+ *
+ * Returns %0 on success (also when 'c->enext' is %NULL and there is nothing
+ * to write) and a negative error code on failure. With
+ * CONFIG_UBIFS_FS_DEBUG, %-EINVAL is returned if a position computed here
+ * differs from the one recorded in the znode or in 'c->new_ihead_*'.
+ */
+static int write_index(struct ubifs_info *c)
+{
+ struct ubifs_idx_node *idx;
+ struct ubifs_znode *znode, *cnext;
+ int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
+ int avail, wlen, err, lnum_pos = 0;
+
+ cnext = c->enext;
+ if (!cnext)
+ return 0;
+
+ /*
+ * Always write index nodes to the index head so that index nodes and
+ * other types of nodes are never mixed in the same erase block.
+ */
+ lnum = c->ihead_lnum;
+ buf_offs = c->ihead_offs;
+
+ /*
+ * Size of the commit buffer window: 'used' is the number of buffered
+ * bytes not written yet, 'avail' is the room left in the window.
+ */
+ buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size);
+ used = 0;
+ avail = buf_len;
+
+ /* Ensure there is enough room for first write */
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+ if (buf_offs + next_len > c->leb_size) {
+ /* NOTE(review): presumably drops the "taken" flag of the LEB being left - confirm */
+ err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0,
+ LPROPS_TAKEN);
+ if (err)
+ return err;
+ /* -1 means "no current LEB": the next one comes from 'c->ilebs' */
+ lnum = -1;
+ }
+
+ while (1) {
+ cond_resched();
+
+ znode = cnext;
+ idx = c->cbuf + used;
+
+ /* Make index node */
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(znode->child_cnt);
+ idx->level = cpu_to_le16(znode->level);
+ for (i = 0; i < znode->child_cnt; i++) {
+ struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ key_write_idx(c, &zbr->key, &br->key);
+ br->lnum = cpu_to_le32(zbr->lnum);
+ br->offs = cpu_to_le32(zbr->offs);
+ br->len = cpu_to_le32(zbr->len);
+ /* A bad reference is only reported; the node is still written */
+ if (!zbr->lnum || !zbr->len) {
+ ubifs_err("bad ref in znode");
+ dbg_dump_znode(c, znode);
+ if (zbr->znode)
+ dbg_dump_znode(c, zbr->znode);
+ }
+ }
+ len = ubifs_idx_node_sz(c, znode->child_cnt);
+ ubifs_prepare_node(c, idx, len, 0);
+
+ /* Determine the index node position */
+ if (lnum == -1) {
+ lnum = c->ilebs[lnum_pos++];
+ buf_offs = 0;
+ used = 0;
+ avail = buf_len;
+ }
+ offs = buf_offs + used;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ /* The position must match what the layout stage stored in the znode */
+ if (lnum != znode->lnum || offs != znode->offs ||
+ len != znode->len) {
+ ubifs_err("inconsistent znode posn");
+ return -EINVAL;
+ }
+#endif
+
+ /* Grab some stuff from znode while we still can */
+ cnext = znode->cnext;
+
+ ubifs_assert(ubifs_zn_dirty(znode));
+ ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+
+ /*
+ * It is important that other threads should see %DIRTY_ZNODE
+ * flag cleared before %COW_ZNODE. Specifically, it matters in
+ * the 'dirty_cow_znode()' function. This is the reason for the
+ * first barrier. Also, we want the bit changes to be seen to
+ * other threads ASAP, to avoid unnecessary copying, which is
+ * the reason for the second barrier.
+ */
+ clear_bit(DIRTY_ZNODE, &znode->flags);
+ smp_mb__before_clear_bit();
+ clear_bit(COW_ZNODE, &znode->flags);
+ smp_mb__after_clear_bit();
+
+ /* Do not access znode from this point on */
+
+ /*
+ * Update buffer positions: 'wlen' is the exact end of the data in
+ * the buffer, while 'used' and 'avail' move in 8-byte aligned steps.
+ */
+ wlen = used + len;
+ used += ALIGN(len, 8);
+ avail -= ALIGN(len, 8);
+
+ /*
+ * Calculate the next index node length to see if there is
+ * enough room for it
+ */
+ if (cnext == c->cnext)
+ /* Back at the list head: this was the last znode */
+ next_len = 0;
+ else
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+ if (c->min_io_size == 1) {
+ /*
+ * Write the prepared index node immediately if there is
+ * no minimum IO size
+ */
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
+ wlen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ buf_offs += ALIGN(wlen, 8);
+ if (next_len) {
+ /* The buffer is empty again for the next node */
+ used = 0;
+ avail = buf_len;
+ if (buf_offs + next_len > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ LPROPS_NC, 0, 0, LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ continue;
+ }
+ } else {
+ int blen, nxt_offs = buf_offs + used + next_len;
+
+ if (next_len && nxt_offs <= c->leb_size) {
+ /* The next node fits into this LEB */
+ if (avail > 0)
+ /* ... and the buffer has room: keep accumulating */
+ continue;
+ else
+ /* Buffer window exhausted: write out one full window */
+ blen = buf_len;
+ } else {
+ /*
+ * Last node overall, or last node for this LEB: pad the
+ * data up to the minimum I/O unit before writing.
+ */
+ wlen = ALIGN(wlen, 8);
+ blen = ALIGN(wlen, c->min_io_size);
+ ubifs_pad(c, c->cbuf + wlen, blen - wlen);
+ }
+ /*
+ * The buffer is full or there are no more znodes
+ * to do
+ */
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
+ blen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ buf_offs += blen;
+ if (next_len) {
+ if (nxt_offs > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ LPROPS_NC, 0, 0, LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ /*
+ * Keep whatever was buffered beyond the written window
+ * and move it to the start of the buffer.
+ */
+ used -= blen;
+ if (used < 0)
+ used = 0;
+ avail = buf_len - used;
+ memmove(c->cbuf, c->cbuf + blen, used);
+ continue;
+ }
+ }
+ break;
+ }
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ /* The final position must match the index head computed by the layout */
+ if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
+ ubifs_err("inconsistent ihead");
+ return -EINVAL;
+ }
+#endif
+
+ c->ihead_lnum = lnum;
+ c->ihead_offs = buf_offs;
+
+ return 0;
+}
+
+/**
+ * free_obsolete_znodes - free obsolete znodes.
+ * @c: UBIFS file-system description object
+ *
+ * At the end of commit end, obsolete znodes are freed. The circular 'cnext'
+ * list headed by 'c->cnext' is walked exactly once: znodes flagged
+ * %OBSOLETE_ZNODE are freed, all the others are unlinked from the list and
+ * accounted as clean in both the per-FS and the global counter. The caller
+ * must make sure 'c->cnext' is not %NULL.
+ */
+static void free_obsolete_znodes(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode = c->cnext, *next;
+
+	do {
+		/* Fetch the link first - 'znode' may be freed below */
+		next = znode->cnext;
+
+		if (!test_bit(OBSOLETE_ZNODE, &znode->flags)) {
+			znode->cnext = NULL;
+			atomic_long_inc(&c->clean_zn_cnt);
+			atomic_long_inc(&ubifs_clean_zn_cnt);
+		} else
+			kfree(znode);
+
+		znode = next;
+	} while (znode != c->cnext);
+}
+
+/**
+ * return_gap_lebs - return LEBs used by the in-gap commit method.
+ * @c: UBIFS file-system description object
+ *
+ * This function clears the "taken" flag for the LEBs which were used by the
+ * "commit in-the-gaps" method. The LEB numbers are read from the
+ * 'c->gap_lebs' array, which is terminated by %-1; the array is freed once
+ * all LEBs have been processed. If changing the properties of a LEB fails,
+ * the error is returned immediately and the array is left in place.
+ */
+static int return_gap_lebs(struct ubifs_info *c)
+{
+	int *lnum_p, err;
+
+	if (c->gap_lebs == NULL)
+		return 0;
+
+	dbg_cmt("");
+	lnum_p = c->gap_lebs;
+	while (*lnum_p != -1) {
+		err = ubifs_change_one_lp(c, *lnum_p, LPROPS_NC, LPROPS_NC, 0,
+					  LPROPS_TAKEN, 0);
+		if (err)
+			return err;
+		lnum_p++;
+	}
+
+	kfree(c->gap_lebs);
+	c->gap_lebs = NULL;
+	return 0;
+}
+
+/**
+ * ubifs_tnc_end_commit - update the TNC for commit end.
+ * @c: UBIFS file-system description object
+ *
+ * Write the dirty znodes. Before that, the LEBs used by the in-gap method are
+ * returned; afterwards, under 'c->tnc_mutex', obsolete znodes are freed and
+ * the commit list and the 'c->ilebs' array are released. Returns zero in case
+ * of success (or if there is nothing to commit) and a negative error code in
+ * case of failure.
+ */
+int ubifs_tnc_end_commit(struct ubifs_info *c)
+{
+	int err;
+
+	if (c->cnext == NULL)
+		return 0;
+
+	err = return_gap_lebs(c);
+	if (!err)
+		err = write_index(c);
+	if (err)
+		return err;
+
+	mutex_lock(&c->tnc_mutex);
+
+	dbg_cmt("TNC height is %d", c->zroot.znode->level + 1);
+
+	free_obsolete_znodes(c);
+	c->cnext = NULL;
+
+	kfree(c->ilebs);
+	c->ilebs = NULL;
+
+	mutex_unlock(&c->tnc_mutex);
+
+	return 0;
+}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
new file mode 100644
index 00000000000..a25c1cc1f8d
--- /dev/null
+++ b/fs/ubifs/tnc_misc.c
@@ -0,0 +1,494 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains miscellaneous TNC-related functions shared between
+ * different files. This file does not form any logically separate TNC
+ * sub-system. The file was created because there is a lot of TNC code and
+ * putting it all in one file would make that file too big and unreadable.
+ */
+
+#include "ubifs.h"
+
+/**
+ * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal.
+ * @zr: root of the subtree to traverse
+ * @znode: previous znode
+ *
+ * This function implements levelorder TNC traversal. The LNC is ignored.
+ * Passing %NULL as @znode starts the traversal and yields @zr itself. Levels
+ * are visited from the level of @zr downwards (the level number is
+ * decremented each time a level is finished).
+ * Returns the next element or %NULL if @znode is already the last one.
+ */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+ struct ubifs_znode *znode)
+{
+ int level, iip, level_search = 0;
+ struct ubifs_znode *zn;
+
+ ubifs_assert(zr);
+
+ /* Start of the traversal: the root comes first */
+ if (unlikely(!znode))
+ return zr;
+
+ /* After the root comes its first child, unless the root is at level 0 */
+ if (unlikely(znode == zr)) {
+ if (znode->level == 0)
+ return NULL;
+ return ubifs_tnc_find_child(zr, 0);
+ }
+
+ /* The level currently being enumerated */
+ level = znode->level;
+
+ iip = znode->iip;
+ while (1) {
+ ubifs_assert(znode->level <= zr->level);
+
+ /*
+ * First walk up until there is a znode with next branch to
+ * look at.
+ */
+ while (znode->parent != zr && iip >= znode->parent->child_cnt) {
+ znode = znode->parent;
+ iip = znode->iip;
+ }
+
+ if (unlikely(znode->parent == zr &&
+ iip >= znode->parent->child_cnt)) {
+ /* This level is done, switch to the lower one */
+ level -= 1;
+ if (level_search || level < 0)
+ /*
+ * We were already looking for znode at lower
+ * level ('level_search'). As we are here
+ * again, it just does not exist. Or all levels
+ * were finished ('level < 0').
+ */
+ return NULL;
+
+ level_search = 1;
+ /* With 'iip' at -1 the lookup below starts from child 0 */
+ iip = -1;
+ znode = ubifs_tnc_find_child(zr, 0);
+ ubifs_assert(znode);
+ }
+
+ /* Switch to the next index */
+ zn = ubifs_tnc_find_child(znode->parent, iip + 1);
+ if (!zn) {
+ /*
+ * No more children to look at, we have to walk up. Setting
+ * 'iip' to 'child_cnt' marks this parent as exhausted.
+ */
+ iip = znode->parent->child_cnt;
+ continue;
+ }
+
+ /* Walk back down to the level we came from ('level') */
+ while (zn->level != level) {
+ znode = zn;
+ zn = ubifs_tnc_find_child(zn, 0);
+ if (!zn) {
+ /*
+ * This path is not too deep so it does not
+ * reach 'level'. Try next path.
+ */
+ iip = znode->iip;
+ break;
+ }
+ }
+
+ if (zn) {
+ ubifs_assert(zn->level >= 0);
+ return zn;
+ }
+ }
+}
+
+/**
+ * ubifs_search_zbranch - search znode branch.
+ * @c: UBIFS file-system description object
+ * @znode: znode to search in
+ * @key: key to search for
+ * @n: znode branch slot number is returned here
+ *
+ * This is a helper function which searches for the branch with key @key in
+ * @znode using binary search. The result of the search may be:
+ *   o exact match, then %1 is returned, and the slot number of the branch is
+ *     stored in @n;
+ *   o no exact match, then %0 is returned and the slot number of the closest
+ *     branch to the left is stored in @n; if all keys in this znode are
+ *     greater than @key, then %-1 is stored in @n.
+ */
+int ubifs_search_zbranch(const struct ubifs_info *c,
+			 const struct ubifs_znode *znode,
+			 const union ubifs_key *key, int *n)
+{
+	const struct ubifs_zbranch *zbr = &znode->zbranch[0];
+	int lo = 0, hi = znode->child_cnt;
+
+	ubifs_assert(hi > lo);
+
+	/* Half-open interval [lo, hi) still holds the candidate slots */
+	while (lo < hi) {
+		int mid = (lo + hi) >> 1;
+		int cmp = keys_cmp(c, key, &zbr[mid].key);
+
+		if (cmp == 0) {
+			*n = mid;
+			return 1;
+		}
+
+		if (cmp > 0)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+
+	*n = hi - 1;
+
+	/* The insert point is after *n */
+	ubifs_assert(*n >= -1 && *n < znode->child_cnt);
+	if (*n == -1)
+		ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0);
+	else
+		ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0);
+	if (*n + 1 < znode->child_cnt)
+		ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0);
+
+	return 0;
+}
+
+/**
+ * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal.
+ * @znode: znode to start at (root of the sub-tree to traverse)
+ *
+ * Find the lowest leftmost znode in a subtree of the TNC tree by following
+ * child 0 for as long as 'ubifs_tnc_find_child()' finds one. The LNC is
+ * ignored. Returns %NULL if @znode is %NULL.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *child;
+
+	if (unlikely(!znode))
+		return NULL;
+
+	for (; znode->level > 0; znode = child) {
+		child = ubifs_tnc_find_child(znode, 0);
+		if (!child)
+			break;
+	}
+
+	return znode;
+}
+
+/**
+ * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal.
+ * @znode: previous znode
+ *
+ * This function implements postorder TNC traversal. The LNC is ignored.
+ * Returns the next element or %NULL if @znode is already the last one, i.e.
+ * if it has no parent.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *parent, *sibling;
+
+	ubifs_assert(znode);
+	parent = znode->parent;
+	if (unlikely(!parent))
+		return NULL;
+
+	/* Is there anything to the right of us in the parent? */
+	sibling = ubifs_tnc_find_child(parent, znode->iip + 1);
+	if (sibling)
+		/* Yes - continue with the first znode of that subtree */
+		return ubifs_tnc_postorder_first(sibling);
+
+	/* No - all children are done, so the parent is next */
+	return parent;
+}
+
+/**
+ * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree.
+ * @znode: znode defining subtree to destroy
+ *
+ * This function destroys subtree of the TNC tree. The subtree is walked in
+ * postorder; at every znode visited, whatever its branches point to is
+ * freed, and finally @znode itself is freed. Returns number of clean
+ * znodes in the subtree.
+ */
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode)
+{
+	struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode);
+	long clean_freed = 0;
+	int n;
+
+	ubifs_assert(zn);
+	for (;; zn = ubifs_tnc_postorder_next(zn)) {
+		for (n = 0; n < zn->child_cnt; n++) {
+			struct ubifs_znode *child = zn->zbranch[n].znode;
+
+			if (!child)
+				continue;
+
+			/* Only above level 0 are the children counted as znodes */
+			if (zn->level > 0 && !ubifs_zn_dirty(child))
+				clean_freed += 1;
+
+			cond_resched();
+			kfree(child);
+		}
+
+		/* Back at the subtree root - nothing more to visit */
+		if (zn == znode)
+			break;
+	}
+
+	if (!ubifs_zn_dirty(zn))
+		clean_freed += 1;
+	kfree(zn);
+	return clean_freed;
+}
+
+/**
+ * read_znode - read an indexing node from flash and fill znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB of the indexing node to read
+ * @offs: node offset
+ * @len: node length
+ * @znode: znode to read to
+ *
+ * This function reads an indexing node from the flash media and fills znode
+ * with the read data. The branch count, the level, the position and key type
+ * of every branch, the target node lengths (level 0 only) and the key order
+ * are validated. Returns zero in case of success and a negative error code in
+ * case of failure. If anything is wrong with the read indexing node, this
+ * function prints complaint messages and returns %-EINVAL.
+ */
+static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
+		      struct ubifs_znode *znode)
+{
+	struct ubifs_idx_node *idx;
+	int n, err, type, cmp;
+
+	idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+	if (idx == NULL)
+		return -ENOMEM;
+
+	err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+	if (err < 0)
+		goto out_free;
+
+	znode->child_cnt = le16_to_cpu(idx->child_cnt);
+	znode->level = le16_to_cpu(idx->level);
+
+	dbg_tnc("LEB %d:%d, level %d, %d branch",
+		lnum, offs, znode->level, znode->child_cnt);
+
+	if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
+		dbg_err("current fanout %d, branch count %d",
+			c->fanout, znode->child_cnt);
+		dbg_err("max levels %d, znode level %d",
+			UBIFS_MAX_LEVELS, znode->level);
+		err = 1;
+		goto out_dump;
+	}
+
+	for (n = 0; n < znode->child_cnt; n++) {
+		const struct ubifs_branch *br = ubifs_idx_branch(c, idx, n);
+		struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+		/* Convert the on-flash branch to its in-memory form */
+		key_read(c, &br->key, &zbr->key);
+		zbr->lnum = le32_to_cpu(br->lnum);
+		zbr->offs = le32_to_cpu(br->offs);
+		zbr->len = le32_to_cpu(br->len);
+		zbr->znode = NULL;
+
+		/*
+		 * The branch has to point into the main area, at an 8-byte
+		 * aligned offset, and must not cross the end of the LEB.
+		 */
+		if (zbr->lnum < c->main_first || zbr->lnum >= c->leb_cnt ||
+		    zbr->offs < 0 || zbr->offs & 7 ||
+		    zbr->offs + zbr->len > c->leb_size) {
+			dbg_err("bad branch %d", n);
+			err = 2;
+			goto out_dump;
+		}
+
+		switch (key_type(c, &zbr->key)) {
+		case UBIFS_INO_KEY:
+		case UBIFS_DATA_KEY:
+		case UBIFS_DENT_KEY:
+		case UBIFS_XENT_KEY:
+			break;
+		default:
+			dbg_msg("bad key type at slot %d: %s", n,
+				DBGKEY(&zbr->key));
+			err = 3;
+			goto out_dump;
+		}
+
+		/* Target node lengths are checked for level 0 znodes only */
+		if (znode->level)
+			continue;
+
+		type = key_type(c, &zbr->key);
+		if (c->ranges[type].max_len != 0) {
+			if (zbr->len < c->ranges[type].min_len ||
+			    zbr->len > c->ranges[type].max_len) {
+				dbg_err("bad target node (type %d) length (%d)",
+					type, zbr->len);
+				dbg_err("have to be in range of %d-%d",
+					c->ranges[type].min_len,
+					c->ranges[type].max_len);
+				err = 5;
+				goto out_dump;
+			}
+		} else if (zbr->len != c->ranges[type].len) {
+			dbg_err("bad target node (type %d) length (%d)",
+				type, zbr->len);
+			dbg_err("have to be %d", c->ranges[type].len);
+			err = 4;
+			goto out_dump;
+		}
+	}
+
+	/*
+	 * Ensure that every key is greater or equivalent to the one in the
+	 * preceding slot.
+	 */
+	for (n = 1; n < znode->child_cnt; n++) {
+		const union ubifs_key *prev = &znode->zbranch[n - 1].key;
+		const union ubifs_key *cur = &znode->zbranch[n].key;
+
+		cmp = keys_cmp(c, prev, cur);
+		if (cmp > 0) {
+			dbg_err("bad key order (keys %d and %d)", n - 1, n);
+			err = 6;
+			goto out_dump;
+		}
+		if (cmp == 0 && !is_hash_key(c, prev)) {
+			/* These can only be keys with colliding hash */
+			dbg_err("keys %d and %d are not hashed but equivalent",
+				n - 1, n);
+			err = 7;
+			goto out_dump;
+		}
+	}
+
+	kfree(idx);
+	return 0;
+
+out_dump:
+	/* 'err' holds the number of the failed check here, not an errno */
+	ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
+	dbg_dump_node(c, idx);
+	err = -EINVAL;
+out_free:
+	kfree(idx);
+	return err;
+}
+
+/**
+ * ubifs_load_znode - load znode to TNC cache.
+ * @c: UBIFS file-system description object
+ * @zbr: znode branch
+ * @parent: znode's parent
+ * @iip: index in parent
+ *
+ * This function loads znode pointed to by @zbr into the TNC cache and
+ * returns pointer to it in case of success and a negative error code
+ * (encoded with 'ERR_PTR()') in case of failure. On success the new znode is
+ * attached to @zbr and both clean znode counters are incremented.
+ */
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+				     struct ubifs_zbranch *zbr,
+				     struct ubifs_znode *parent, int iip)
+{
+	struct ubifs_znode *znode;
+	int err;
+
+	ubifs_assert(!zbr->znode);
+
+	/*
+	 * No slab cache is used for znodes at present, because the size of a
+	 * znode depends on the fanout, which is stored in the superblock.
+	 */
+	znode = kzalloc(c->max_znode_sz, GFP_NOFS);
+	if (znode == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
+	if (err) {
+		kfree(znode);
+		return ERR_PTR(err);
+	}
+
+	/*
+	 * Both the per-FS and the global clean znode counters are bumped. It
+	 * is OK that they may be inconsistent for some short time (because we
+	 * might be preempted between the two increments), the global one is
+	 * only used in shrinker.
+	 */
+	atomic_long_inc(&c->clean_zn_cnt);
+	atomic_long_inc(&ubifs_clean_zn_cnt);
+
+	zbr->znode = znode;
+	znode->parent = parent;
+	znode->time = get_seconds();
+	znode->iip = iip;
+
+	return znode;
+}
+
+/**
+ * ubifs_tnc_read_node - read a leaf node from the flash media.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a node defined by @zbr from the flash media and checks
+ * that the key stored in the node matches the key in @zbr. Returns zero in
+ * case of success or a negative error code in case of failure (%-EINVAL if
+ * the keys do not match).
+ */
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+			void *node)
+{
+	union ubifs_key key1, *key = &zbr->key;
+	int err, type = key_type(c, key);
+	struct ubifs_wbuf *wbuf;
+
+	/*
+	 * 'zbr' has to point to on-flash node. The node may sit in a bud and
+	 * may even be in a write buffer, so the read goes through the write
+	 * buffer whenever there is one for this LEB.
+	 */
+	wbuf = ubifs_get_wbuf(c, zbr->lnum);
+	if (!wbuf)
+		err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum,
+				      zbr->offs);
+	else
+		err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len,
+					   zbr->lnum, zbr->offs);
+	if (err) {
+		dbg_tnc("key %s", DBGKEY(key));
+		return err;
+	}
+
+	/* Make sure the key of the read node is correct */
+	key_read(c, key, &key1);
+	if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len) == 0)
+		return 0;
+
+	ubifs_err("bad key in node at LEB %d:%d",
+		  zbr->lnum, zbr->offs);
+	dbg_tnc("looked for key %s found node's key %s",
+		DBGKEY(key), DBGKEY1(&key1));
+	dbg_dump_node(c, node);
+	return -EINVAL;
+}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
new file mode 100644
index 00000000000..0cc7da9bed4
--- /dev/null
+++ b/fs/ubifs/ubifs-media.h
@@ -0,0 +1,745 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file describes UBIFS on-flash format and contains definitions of all the
+ * relevant data structures and constants.
+ *
+ * All UBIFS on-flash objects are stored in the form of nodes. All nodes start
+ * with the UBIFS node magic number and have the same common header. Nodes
+ * always sit at 8-byte aligned positions on the media and node header sizes are
+ * also 8-byte aligned (except for the indexing node and the padding node).
+ */
+
+#ifndef __UBIFS_MEDIA_H__
+#define __UBIFS_MEDIA_H__
+
+/* UBIFS node magic number (must not have the padding byte first or last) */
+#define UBIFS_NODE_MAGIC 0x06101831
+
+/* UBIFS on-flash format version */
+#define UBIFS_FORMAT_VERSION 4
+
+/* Minimum logical eraseblock size in bytes */
+#define UBIFS_MIN_LEB_SZ (15*1024)
+
+/* Initial CRC32 value used when calculating CRC checksums */
+#define UBIFS_CRC32_INIT 0xFFFFFFFFU
+
+/*
+ * UBIFS does not try to compress data if its length is less than the below
+ * constant.
+ */
+#define UBIFS_MIN_COMPR_LEN 128
+
+/* Root inode number */
+#define UBIFS_ROOT_INO 1
+
+/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */
+#define UBIFS_FIRST_INO 64
+
+/*
+ * Maximum file name and extended attribute length (must be a multiple of 8,
+ * minus 1).
+ */
+#define UBIFS_MAX_NLEN 255
+
+/* Maximum number of data journal heads */
+#define UBIFS_MAX_JHEADS 1
+
+/*
+ * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system,
+ * which means that it does not treat the underlying media as consisting of
+ * blocks like in case of hard drives. Do not be confused. UBIFS block is just
+ * the maximum amount of data which one data node can have or which can be
+ * attached to an inode node.
+ */
+#define UBIFS_BLOCK_SIZE 4096
+#define UBIFS_BLOCK_SHIFT 12
+#define UBIFS_BLOCK_MASK 0x00000FFF
+
+/* UBIFS padding byte pattern (must not be first or last byte of node magic) */
+#define UBIFS_PADDING_BYTE 0xCE
+
+/* Maximum possible key length */
+#define UBIFS_MAX_KEY_LEN 16
+
+/* Key length ("simple" format) */
+#define UBIFS_SK_LEN 8
+
+/* Minimum index tree fanout */
+#define UBIFS_MIN_FANOUT 2
+
+/* Maximum number of levels in UBIFS indexing B-tree */
+#define UBIFS_MAX_LEVELS 512
+
+/* Maximum amount of data attached to an inode in bytes */
+#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE
+
+/* LEB Properties Tree fanout (must be power of 2) and fanout shift */
+#define UBIFS_LPT_FANOUT 4
+#define UBIFS_LPT_FANOUT_SHIFT 2
+
+/* LEB Properties Tree bit field sizes */
+#define UBIFS_LPT_CRC_BITS 16
+#define UBIFS_LPT_CRC_BYTES 2
+#define UBIFS_LPT_TYPE_BITS 4
+
+/* The key is always at the same position in all keyed nodes */
+#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+
+/*
+ * LEB Properties Tree node types.
+ *
+ * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties)
+ * UBIFS_LPT_NNODE: LPT internal node
+ * UBIFS_LPT_LTAB: LPT's own lprops table
+ * UBIFS_LPT_LSAVE: LPT's save table (big model only)
+ * UBIFS_LPT_NODE_CNT: count of LPT node types
+ * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type
+ *
+ * The first four constants are numbered implicitly from 0, which makes
+ * UBIFS_LPT_NODE_CNT equal to 4.
+ */
+enum {
+ UBIFS_LPT_PNODE,
+ UBIFS_LPT_NNODE,
+ UBIFS_LPT_LTAB,
+ UBIFS_LPT_LSAVE,
+ UBIFS_LPT_NODE_CNT,
+ /* Largest value that fits into the UBIFS_LPT_TYPE_BITS-wide type field */
+ UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1,
+};
+
+/*
+ * UBIFS inode types.
+ *
+ * UBIFS_ITYPE_REG: regular file
+ * UBIFS_ITYPE_DIR: directory
+ * UBIFS_ITYPE_LNK: soft link
+ * UBIFS_ITYPE_BLK: block device node
+ * UBIFS_ITYPE_CHR: character device node
+ * UBIFS_ITYPE_FIFO: fifo
+ * UBIFS_ITYPE_SOCK: socket
+ * UBIFS_ITYPES_CNT: count of supported file types
+ *
+ * The constants are numbered implicitly from 0 in declaration order, so
+ * UBIFS_ITYPES_CNT equals 7. NOTE(review): as this header describes the
+ * on-flash format, the numeric values presumably must not change - confirm.
+ */
+enum {
+ UBIFS_ITYPE_REG,
+ UBIFS_ITYPE_DIR,
+ UBIFS_ITYPE_LNK,
+ UBIFS_ITYPE_BLK,
+ UBIFS_ITYPE_CHR,
+ UBIFS_ITYPE_FIFO,
+ UBIFS_ITYPE_SOCK,
+ UBIFS_ITYPES_CNT,
+};
+
+/*
+ * Supported key hash functions.
+ *
+ * UBIFS_KEY_HASH_R5: R5 hash
+ * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name
+ *
+ * The constants are numbered implicitly: R5 is 0 and TEST is 1.
+ */
+enum {
+ UBIFS_KEY_HASH_R5,
+ UBIFS_KEY_HASH_TEST,
+};
+
+/*
+ * Supported key formats.
+ *
+ * UBIFS_SIMPLE_KEY_FMT: simple key format
+ *
+ * This is the only format defined here; its implicit value is 0.
+ */
+enum {
+ UBIFS_SIMPLE_KEY_FMT,
+};
+
+/*
+ * The simple key format uses 29 bits for storing UBIFS block number and hash
+ * value.
+ */
+#define UBIFS_S_KEY_BLOCK_BITS 29
+#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF
+#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS
+#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK
+
+/*
+ * Key types.
+ *
+ * UBIFS_INO_KEY: inode node key
+ * UBIFS_DATA_KEY: data node key
+ * UBIFS_DENT_KEY: directory entry node key
+ * UBIFS_XENT_KEY: extended attribute entry key
+ * UBIFS_KEY_TYPES_CNT: number of supported key types
+ *
+ * The constants are numbered implicitly from 0 in declaration order, so
+ * UBIFS_KEY_TYPES_CNT equals 4.
+ */
+enum {
+ UBIFS_INO_KEY,
+ UBIFS_DATA_KEY,
+ UBIFS_DENT_KEY,
+ UBIFS_XENT_KEY,
+ UBIFS_KEY_TYPES_CNT,
+};
+
+/* Count of LEBs reserved for the superblock area */
+#define UBIFS_SB_LEBS 1
+/* Count of LEBs reserved for the master area */
+#define UBIFS_MST_LEBS 2
+
+/* First LEB of the superblock area */
+#define UBIFS_SB_LNUM 0
+/* First LEB of the master area */
+#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS)
+/* First LEB of the log area */
+#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS)
+
+/*
+ * The below constants define the absolute minimum values for various UBIFS
+ * media areas. Many of them actually depend on flash geometry and the FS
+ * configuration (number of journal heads, orphan LEBs, etc). This means that
+ * the smallest volume size which can be used for UBIFS cannot be pre-defined
+ * by these constants. The file-system that meets the below limitation will not
+ * necessarily mount. UBIFS does run-time calculations and validates the FS
+ * size.
+ */
+
+/* Minimum number of logical eraseblocks in the log */
+#define UBIFS_MIN_LOG_LEBS 2
+/* Minimum number of bud logical eraseblocks (one for each head) */
+#define UBIFS_MIN_BUD_LEBS 3
+/* Minimum number of journal logical eraseblocks */
+#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS)
+/* Minimum number of LPT area logical eraseblocks */
+#define UBIFS_MIN_LPT_LEBS 2
+/* Minimum number of orphan area logical eraseblocks */
+#define UBIFS_MIN_ORPH_LEBS 1
+/*
+ * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1
+ * for GC, 1 for deletions, and at least 1 for committed data).
+ */
+#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5)
+
+/* Minimum number of logical eraseblocks */
+#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
+ UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \
+ UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS)
+
+/* Node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_CH_SZ sizeof(struct ubifs_ch)
+#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node)
+#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node)
+#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node)
+#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node)
+#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node)
+#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node)
+#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node)
+#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node)
+#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node)
+#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node)
+#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node)
+/* Extended attribute entry nodes are identical to directory entry nodes */
+#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ
+/* Only this does not have to be multiple of 8 bytes */
+#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch)
+
+/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
+#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA)
+#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1)
+#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ
+
+/* The largest UBIFS node */
+#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
+
+/*
+ * On-flash inode flags.
+ *
+ * UBIFS_COMPR_FL: use compression for this inode
+ * UBIFS_SYNC_FL: I/O on this inode has to be synchronous
+ * UBIFS_IMMUTABLE_FL: inode is immutable
+ * UBIFS_APPEND_FL: writes to the inode may only append data
+ * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
+ * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
+ *
+ * Note, these are on-flash flags which correspond to ioctl flags
+ * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
+ * have to be the same.
+ */
+enum {
+ UBIFS_COMPR_FL = 0x01,
+ UBIFS_SYNC_FL = 0x02,
+ UBIFS_IMMUTABLE_FL = 0x04,
+ UBIFS_APPEND_FL = 0x08,
+ UBIFS_DIRSYNC_FL = 0x10,
+ UBIFS_XATTR_FL = 0x20,
+};
+
+/* Inode flag bits used by UBIFS */
+#define UBIFS_FL_MASK 0x0000001F
+
+/*
+ * UBIFS compression algorithms.
+ *
+ * UBIFS_COMPR_NONE: no compression
+ * UBIFS_COMPR_LZO: LZO compression
+ * UBIFS_COMPR_ZLIB: ZLIB compression
+ * UBIFS_COMPR_TYPES_CNT: count of supported compression types
+ */
+enum {
+ UBIFS_COMPR_NONE,
+ UBIFS_COMPR_LZO,
+ UBIFS_COMPR_ZLIB,
+ UBIFS_COMPR_TYPES_CNT,
+};
+
+/*
+ * UBIFS node types.
+ *
+ * UBIFS_INO_NODE: inode node
+ * UBIFS_DATA_NODE: data node
+ * UBIFS_DENT_NODE: directory entry node
+ * UBIFS_XENT_NODE: extended attribute node
+ * UBIFS_TRUN_NODE: truncation node
+ * UBIFS_PAD_NODE: padding node
+ * UBIFS_SB_NODE: superblock node
+ * UBIFS_MST_NODE: master node
+ * UBIFS_REF_NODE: LEB reference node
+ * UBIFS_IDX_NODE: index node
+ * UBIFS_CS_NODE: commit start node
+ * UBIFS_ORPH_NODE: orphan node
+ * UBIFS_NODE_TYPES_CNT: count of supported node types
+ *
+ * Note, we index arrays by these numbers, so keep them low and contiguous.
+ * Node type constants for inodes, direntries and so on have to be the same as
+ * corresponding key type constants.
+ */
+enum {
+ UBIFS_INO_NODE,
+ UBIFS_DATA_NODE,
+ UBIFS_DENT_NODE,
+ UBIFS_XENT_NODE,
+ UBIFS_TRUN_NODE,
+ UBIFS_PAD_NODE,
+ UBIFS_SB_NODE,
+ UBIFS_MST_NODE,
+ UBIFS_REF_NODE,
+ UBIFS_IDX_NODE,
+ UBIFS_CS_NODE,
+ UBIFS_ORPH_NODE,
+ UBIFS_NODE_TYPES_CNT,
+};
+
+/*
+ * Master node flags.
+ *
+ * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty
+ * UBIFS_MST_NO_ORPHS: no orphan inodes present
+ * UBIFS_MST_RCVRY: written by recovery
+ */
+enum {
+ UBIFS_MST_DIRTY = 1,
+ UBIFS_MST_NO_ORPHS = 2,
+ UBIFS_MST_RCVRY = 4,
+};
+
+/*
+ * Node group type (used by recovery to recover whole group or none).
+ *
+ * UBIFS_NO_NODE_GROUP: this node is not part of a group
+ * UBIFS_IN_NODE_GROUP: this node is a part of a group
+ * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group
+ */
+enum {
+ UBIFS_NO_NODE_GROUP = 0,
+ UBIFS_IN_NODE_GROUP,
+ UBIFS_LAST_OF_NODE_GROUP,
+};
+
+/*
+ * Superblock flags.
+ *
+ * UBIFS_FLG_BIGLPT: the "big" LPT model is used if this flag is set
+ */
+enum {
+ UBIFS_FLG_BIGLPT = 0x02,
+};
+
+/**
+ * struct ubifs_ch - common header node.
+ * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
+ * @crc: CRC-32 checksum of the node header
+ * @sqnum: sequence number
+ * @len: full node length
+ * @node_type: node type
+ * @group_type: node group type
+ * @padding: reserved for future, zeroes
+ *
+ * Every UBIFS node starts with this common part. If the node has a key, the
+ * key always goes next.
+ */
+struct ubifs_ch {
+ __le32 magic;
+ __le32 crc;
+ __le64 sqnum;
+ __le32 len;
+ __u8 node_type;
+ __u8 group_type;
+ __u8 padding[2];
+} __attribute__ ((packed));
+
+/**
+ * union ubifs_dev_desc - device node descriptor.
+ * @new: new type device descriptor
+ * @huge: huge type device descriptor
+ *
+ * This data structure describes major/minor numbers of a device node. If an
+ * inode is a device node then its data contains an object of this type. UBIFS
+ * uses standard Linux "new" and "huge" device node encodings.
+ */
+union ubifs_dev_desc {
+ __le32 new;
+ __le64 huge;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_ino_node - inode node.
+ * @ch: common header
+ * @key: node key
+ * @creat_sqnum: sequence number at time of creation
+ * @size: inode size in bytes (amount of uncompressed data)
+ * @atime_sec: access time seconds
+ * @ctime_sec: creation time seconds
+ * @mtime_sec: modification time seconds
+ * @atime_nsec: access time nanoseconds
+ * @ctime_nsec: creation time nanoseconds
+ * @mtime_nsec: modification time nanoseconds
+ * @nlink: number of hard links
+ * @uid: owner ID
+ * @gid: group ID
+ * @mode: access flags
+ * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc)
+ * @data_len: inode data length
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @padding1: reserved for future, zeroes
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ * this inode
+ * @compr_type: compression type used for this inode
+ * @padding2: reserved for future, zeroes
+ * @data: data attached to the inode
+ *
+ * Note, even though inode compression type is defined by @compr_type, some
+ * nodes of this inode may be compressed with different compressor - this
+ * happens if compression type is changed while the inode already has data
+ * nodes. But @compr_type will be used for further writes to the inode.
+ *
+ * Note, do not forget to amend 'zero_ino_node_unused()' function when changing
+ * the padding fields.
+ */
+struct ubifs_ino_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le64 creat_sqnum;
+ __le64 size;
+ __le64 atime_sec;
+ __le64 ctime_sec;
+ __le64 mtime_sec;
+ __le32 atime_nsec;
+ __le32 ctime_nsec;
+ __le32 mtime_nsec;
+ __le32 nlink;
+ __le32 uid;
+ __le32 gid;
+ __le32 mode;
+ __le32 flags;
+ __le32 data_len;
+ __le32 xattr_cnt;
+ __le32 xattr_size;
+ __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
+ __le32 xattr_names;
+ __le16 compr_type;
+ __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
+ __u8 data[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_dent_node - directory entry node.
+ * @ch: common header
+ * @key: node key
+ * @inum: target inode number
+ * @padding1: reserved for future, zeroes
+ * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
+ * @nlen: name length
+ * @padding2: reserved for future, zeroes
+ * @name: zero-terminated name
+ *
+ * Note, do not forget to amend 'zero_dent_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_dent_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le64 inum;
+ __u8 padding1;
+ __u8 type;
+ __le16 nlen;
+ __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
+ __u8 name[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_data_node - data node.
+ * @ch: common header
+ * @key: node key
+ * @size: uncompressed data size in bytes
+ * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
+ * @padding: reserved for future, zeroes
+ * @data: data
+ *
+ * Note, do not forget to amend 'zero_data_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_data_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le32 size;
+ __le16 compr_type;
+ __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
+ __u8 data[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_trun_node - truncation node.
+ * @ch: common header
+ * @inum: truncated inode number
+ * @padding: reserved for future, zeroes
+ * @old_size: size before truncation
+ * @new_size: size after truncation
+ *
+ * This node exists only in the journal and never goes to the main area. Note,
+ * do not forget to amend 'zero_trun_node_unused()' function when changing the
+ * padding fields.
+ */
+struct ubifs_trun_node {
+ struct ubifs_ch ch;
+ __le32 inum;
+ __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
+ __le64 old_size;
+ __le64 new_size;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_pad_node - padding node.
+ * @ch: common header
+ * @pad_len: how many bytes after this node are unused (because padded)
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_pad_node {
+ struct ubifs_ch ch;
+ __le32 pad_len;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_sb_node - superblock node.
+ * @ch: common header
+ * @padding: reserved for future, zeroes
+ * @key_hash: type of hash function used in keys
+ * @key_fmt: format of the key
+ * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc)
+ * @min_io_size: minimal input/output unit size
+ * @leb_size: logical eraseblock size in bytes
+ * @leb_cnt: count of LEBs used by file-system
+ * @max_leb_cnt: maximum count of LEBs used by file-system
+ * @max_bud_bytes: maximum amount of data stored in buds
+ * @log_lebs: log size in logical eraseblocks
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @orph_lebs: number of LEBs used for recording orphans
+ * @jhead_cnt: count of journal heads
+ * @fanout: tree fanout (max. number of links per indexing node)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @fmt_version: UBIFS on-flash format version
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @padding1: reserved for future, zeroes
+ * @rp_uid: reserve pool UID
+ * @rp_gid: reserve pool GID
+ * @rp_size: size of the reserved pool in bytes
+ * @padding2: reserved for future, zeroes
+ * @time_gran: time granularity in nanoseconds
+ * @uuid: UUID generated when the file system image was created
+ */
+struct ubifs_sb_node {
+ struct ubifs_ch ch;
+ __u8 padding[2];
+ __u8 key_hash;
+ __u8 key_fmt;
+ __le32 flags;
+ __le32 min_io_size;
+ __le32 leb_size;
+ __le32 leb_cnt;
+ __le32 max_leb_cnt;
+ __le64 max_bud_bytes;
+ __le32 log_lebs;
+ __le32 lpt_lebs;
+ __le32 orph_lebs;
+ __le32 jhead_cnt;
+ __le32 fanout;
+ __le32 lsave_cnt;
+ __le32 fmt_version;
+ __le16 default_compr;
+ __u8 padding1[2];
+ __le32 rp_uid;
+ __le32 rp_gid;
+ __le64 rp_size;
+ __le32 time_gran;
+ __u8 uuid[16];
+ __u8 padding2[3972];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_mst_node - master node.
+ * @ch: common header
+ * @highest_inum: highest inode number in the committed index
+ * @cmt_no: commit number
+ * @flags: various flags (%UBIFS_MST_DIRTY, etc)
+ * @log_lnum: start of the log
+ * @root_lnum: LEB number of the root indexing node
+ * @root_offs: offset within @root_lnum
+ * @root_len: root indexing node length
+ * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was
+ * not reserved and should be reserved on mount)
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @index_size: size of index on flash
+ * @total_free: total free space in bytes
+ * @total_dirty: total dirty space in bytes
+ * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @lpt_lnum: LEB number of LPT root nnode
+ * @lpt_offs: offset of LPT root nnode
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @lsave_lnum: LEB number of LPT's save table (big model only)
+ * @lsave_offs: offset of LPT's save table (big model only)
+ * @lscan_lnum: LEB number of last LPT scan
+ * @empty_lebs: number of empty logical eraseblocks
+ * @idx_lebs: number of indexing logical eraseblocks
+ * @leb_cnt: count of LEBs used by file-system
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_mst_node {
+ struct ubifs_ch ch;
+ __le64 highest_inum;
+ __le64 cmt_no;
+ __le32 flags;
+ __le32 log_lnum;
+ __le32 root_lnum;
+ __le32 root_offs;
+ __le32 root_len;
+ __le32 gc_lnum;
+ __le32 ihead_lnum;
+ __le32 ihead_offs;
+ __le64 index_size;
+ __le64 total_free;
+ __le64 total_dirty;
+ __le64 total_used;
+ __le64 total_dead;
+ __le64 total_dark;
+ __le32 lpt_lnum;
+ __le32 lpt_offs;
+ __le32 nhead_lnum;
+ __le32 nhead_offs;
+ __le32 ltab_lnum;
+ __le32 ltab_offs;
+ __le32 lsave_lnum;
+ __le32 lsave_offs;
+ __le32 lscan_lnum;
+ __le32 empty_lebs;
+ __le32 idx_lebs;
+ __le32 leb_cnt;
+ __u8 padding[344];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_ref_node - logical eraseblock reference node.
+ * @ch: common header
+ * @lnum: the referred logical eraseblock number
+ * @offs: start offset in the referred LEB
+ * @jhead: journal head number
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_ref_node {
+ struct ubifs_ch ch;
+ __le32 lnum;
+ __le32 offs;
+ __le32 jhead;
+ __u8 padding[28];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_branch - key/reference/length branch
+ * @lnum: LEB number of the target node
+ * @offs: offset within @lnum
+ * @len: target node length
+ * @key: key
+ */
+struct ubifs_branch {
+ __le32 lnum;
+ __le32 offs;
+ __le32 len;
+ __u8 key[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_idx_node - indexing node.
+ * @ch: common header
+ * @child_cnt: number of child index nodes
+ * @level: tree level
+ * @branches: LEB number / offset / length / key branches
+ */
+struct ubifs_idx_node {
+ struct ubifs_ch ch;
+ __le16 child_cnt;
+ __le16 level;
+ __u8 branches[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_cs_node - commit start node.
+ * @ch: common header
+ * @cmt_no: commit number
+ */
+struct ubifs_cs_node {
+ struct ubifs_ch ch;
+ __le64 cmt_no;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_orph_node - orphan node.
+ * @ch: common header
+ * @cmt_no: commit number (also top bit is set on the last node of the commit)
+ * @inos: inode numbers of orphans
+ */
+struct ubifs_orph_node {
+ struct ubifs_ch ch;
+ __le64 cmt_no;
+ __le64 inos[];
+} __attribute__ ((packed));
+
+#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
new file mode 100644
index 00000000000..e4f89f27182
--- /dev/null
+++ b/fs/ubifs/ubifs.h
@@ -0,0 +1,1649 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/* Implementation version 0.7 */
+
+#ifndef __UBIFS_H__
+#define __UBIFS_H__
+
+#include <asm/div64.h>
+#include <linux/statfs.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/mtd/ubi.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include "ubifs-media.h"
+
+/* Version of this UBIFS implementation */
+#define UBIFS_VERSION 1
+
+/* Normal UBIFS messages */
+#define ubifs_msg(fmt, ...) \
+ printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
+/* UBIFS error messages */
+#define ubifs_err(fmt, ...) \
+ printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__)
+/* UBIFS warning messages */
+#define ubifs_warn(fmt, ...) \
+ printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
+ current->pid, __func__, ##__VA_ARGS__)
+
+/* UBIFS file system VFS magic number */
+#define UBIFS_SUPER_MAGIC 0x24051905
+
+/* Number of UBIFS blocks per VFS page */
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+
+/* "File system end of life" sequence number watermark */
+#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
+#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
+
+/* Minimum amount of data UBIFS writes to the flash */
+#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
+
+/*
+ * Currently we do not support inode number overlapping and re-using, so this
+ * watermark defines dangerous inode number level. This should be fixed later,
+ * although it is difficult to exceed current limit. Another option is to use
+ * 64-bit inode numbers, but this means more overhead.
+ */
+#define INUM_WARN_WATERMARK 0xFFF00000
+#define INUM_WATERMARK 0xFFFFFF00
+
+/* Largest key size supported in this implementation */
+#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
+
+/* Maximum number of entries in each LPT (LEB category) heap */
+#define LPT_HEAP_SZ 256
+
+/*
+ * Background thread name pattern. The numbers are UBI device and volume
+ * numbers.
+ */
+#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
+
+/* Default write-buffer synchronization timeout (5 secs) */
+#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
+
+/* Maximum possible inode number (only 32-bit inodes are supported now) */
+#define MAX_INUM 0xFFFFFFFF
+
+/* Number of non-data journal heads */
+#define NONDATA_JHEADS_CNT 2
+
+/* Garbage collector head */
+#define GCHD 0
+/* Base journal head number */
+#define BASEHD 1
+/* First "general purpose" journal head */
+#define DATAHD 2
+
+/* 'No change' value for 'ubifs_change_lp()' */
+#define LPROPS_NC 0x80000001
+
+/*
+ * There is no notion of truncation key because truncation nodes do not exist
+ * in TNC. However, when replaying, it is handy to introduce fake "truncation"
+ * keys for truncation nodes because the code becomes simpler. So we define
+ * %UBIFS_TRUN_KEY type.
+ */
+#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
+
+/*
+ * How much a directory entry/extended attribute entry adds to the parent/host
+ * inode.
+ */
+#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8)
+
+/* How much an extended attribute adds to the host inode */
+#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8)
+
+/*
+ * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered
+ * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are
+ * considered "young". This is used by shrinker when selecting znode to trim
+ * off.
+ */
+#define OLD_ZNODE_AGE 20
+#define YOUNG_ZNODE_AGE 5
+
+/*
+ * Some compressors, like LZO, may end up with more data than the input buffer.
+ * So UBIFS always allocates larger output buffer, to be sure the compressor
+ * will not corrupt memory in case of worst case compression.
+ */
+#define WORST_COMPR_FACTOR 2
+
+/* Maximum expected tree height for use by bottom_up_buf */
+#define BOTTOM_UP_HEIGHT 64
+
+/*
+ * Lockdep classes for UBIFS inode @ui_mutex.
+ */
+enum {
+ WB_MUTEX_1 = 0,
+ WB_MUTEX_2 = 1,
+ WB_MUTEX_3 = 2,
+};
+
+/*
+ * Znode flags (actually, bit numbers which store the flags).
+ *
+ * DIRTY_ZNODE: znode is dirty
+ * COW_ZNODE: znode is being committed and a new instance of this znode has to
+ * be created before changing this znode
+ * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is
+ * still in the commit list and the ongoing commit operation
+ * will commit it, and delete this znode after it is done
+ */
+enum {
+ DIRTY_ZNODE = 0,
+ COW_ZNODE = 1,
+ OBSOLETE_ZNODE = 2,
+};
+
+/*
+ * Commit states.
+ *
+ * COMMIT_RESTING: commit is not wanted
+ * COMMIT_BACKGROUND: background commit has been requested
+ * COMMIT_REQUIRED: commit is required
+ * COMMIT_RUNNING_BACKGROUND: background commit is running
+ * COMMIT_RUNNING_REQUIRED: commit is running and it is required
+ * COMMIT_BROKEN: commit failed
+ */
+enum {
+ COMMIT_RESTING = 0,
+ COMMIT_BACKGROUND,
+ COMMIT_REQUIRED,
+ COMMIT_RUNNING_BACKGROUND,
+ COMMIT_RUNNING_REQUIRED,
+ COMMIT_BROKEN,
+};
+
+/*
+ * 'ubifs_scan_a_node()' return values.
+ *
+ * SCANNED_GARBAGE: scanned garbage
+ * SCANNED_EMPTY_SPACE: scanned empty space
+ * SCANNED_A_NODE: scanned a valid node
+ * SCANNED_A_CORRUPT_NODE: scanned a corrupted node
+ * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length
+ *
+ * Greater than zero means: 'scanned that number of padding bytes'
+ */
+enum {
+ SCANNED_GARBAGE = 0,
+ SCANNED_EMPTY_SPACE = -1,
+ SCANNED_A_NODE = -2,
+ SCANNED_A_CORRUPT_NODE = -3,
+ SCANNED_A_BAD_PAD_NODE = -4,
+};
+
+/*
+ * LPT cnode flag bits.
+ *
+ * DIRTY_CNODE: cnode is dirty
+ * COW_CNODE: cnode is being committed and must be copied before writing
+ * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
+ * so it can (and must) be freed when the commit is finished
+ */
+enum {
+ DIRTY_CNODE = 0,
+ COW_CNODE = 1,
+ OBSOLETE_CNODE = 2,
+};
+
+/*
+ * Dirty flag bits (lpt_drty_flgs) for LPT special nodes.
+ *
+ * LTAB_DIRTY: ltab node is dirty
+ * LSAVE_DIRTY: lsave node is dirty
+ */
+enum {
+ LTAB_DIRTY = 1,
+ LSAVE_DIRTY = 2,
+};
+
+/*
+ * Return codes used by the garbage collector.
+ * @LEB_FREED: the logical eraseblock was freed and is ready to use
+ * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit
+ * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes
+ */
+enum {
+ LEB_FREED,
+ LEB_FREED_IDX,
+ LEB_RETAINED,
+};
+
+/**
+ * struct ubifs_old_idx - index node obsoleted since last commit start.
+ * @rb: rb-tree node
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ */
+struct ubifs_old_idx {
+ struct rb_node rb;
+ int lnum;
+ int offs;
+};
+
+/* The below union makes it easier to deal with keys */
+union ubifs_key {
+ uint8_t u8[CUR_MAX_KEY_LEN];
+ uint32_t u32[CUR_MAX_KEY_LEN/4];
+ uint64_t u64[CUR_MAX_KEY_LEN/8];
+ __le32 j32[CUR_MAX_KEY_LEN/4];
+};
+
+/**
+ * struct ubifs_scan_node - UBIFS scanned node information.
+ * @list: list of scanned nodes
+ * @key: key of node scanned (if it has one)
+ * @sqnum: sequence number
+ * @type: type of node scanned
+ * @offs: offset within LEB of node scanned
+ * @len: length of node scanned
+ * @node: raw node
+ */
+struct ubifs_scan_node {
+ struct list_head list;
+ union ubifs_key key;
+ unsigned long long sqnum;
+ int type;
+ int offs;
+ int len;
+ void *node;
+};
+
+/**
+ * struct ubifs_scan_leb - UBIFS scanned LEB information.
+ * @lnum: logical eraseblock number
+ * @nodes_cnt: number of nodes scanned
+ * @nodes: list of struct ubifs_scan_node
+ * @endpt: end point (and therefore the start of empty space)
+ * @ecc: read returned -EBADMSG
+ * @buf: buffer containing entire LEB scanned
+ */
+struct ubifs_scan_leb {
+ int lnum;
+ int nodes_cnt;
+ struct list_head nodes;
+ int endpt;
+ int ecc;
+ void *buf;
+};
+
+/**
+ * struct ubifs_gced_idx_leb - garbage-collected indexing LEB.
+ * @list: list
+ * @lnum: LEB number
+ * @unmap: OK to unmap this LEB
+ *
+ * This data structure is used to temporarily store garbage-collected indexing
+ * LEBs - they are not released immediately, but only after the next commit.
+ * This is needed to guarantee recoverability.
+ */
+struct ubifs_gced_idx_leb {
+ struct list_head list;
+ int lnum;
+ int unmap;
+};
+
+/**
+ * struct ubifs_inode - UBIFS in-memory inode description.
+ * @vfs_inode: VFS inode description object
+ * @creat_sqnum: sequence number at time of creation
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ * this inode
+ * @dirty: non-zero if the inode is dirty
+ * @xattr: non-zero if this is an extended attribute inode
+ * @ui_mutex: serializes inode write-back with the rest of VFS operations,
+ * serializes "clean <-> dirty" state changes, protects @dirty,
+ * @ui_size, and @xattr_size
+ * @ui_lock: protects @synced_i_size
+ * @synced_i_size: synchronized size of inode, i.e. the value of inode size
+ * currently stored on the flash; used only for regular file
+ * inodes
+ * @ui_size: inode size used by UBIFS when writing to flash
+ * @flags: inode flags (@UBIFS_COMPR_FL, etc)
+ * @compr_type: default compression type used for this inode
+ * @data_len: length of the data attached to the inode
+ * @data: inode's data
+ *
+ * @ui_mutex exists for two main reasons. First, it prevents inodes from
+ * being written back while UBIFS is changing them in the middle of a VFS
+ * operation. This way UBIFS makes sure the inode fields are consistent. For
+ * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
+ * write-back must not write any of them before we have finished.
+ *
+ * The second reason is budgeting - UBIFS has to budget all operations. If an
+ * operation is going to mark an inode dirty, it has to allocate budget for
+ * this. It cannot just mark it dirty because there is no guarantee there will
+ * be enough flash space to write the inode back later. This means UBIFS has
+ * to have full control over inode "clean <-> dirty" transitions (and pages
+ * actually). But unfortunately, VFS marks inodes dirty in many places, and it
+ * does not ask the file-system if it is allowed to do so (there is a notifier,
+ * but it is not enough), i.e., there is no mechanism to synchronize with this.
+ * So UBIFS has its own inode dirty flag and its own mutex to serialize
+ * "clean <-> dirty" transitions.
+ *
+ * The @synced_i_size field is used to make sure we never write pages which are
+ * beyond last synchronized inode size. See 'ubifs_writepage()' for more
+ * information.
+ *
+ * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
+ * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
+ * make sure @inode->i_size is always changed under @ui_mutex, because it
+ * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * with 'ubifs_writepage()' (see file.c). All the other inode fields are
+ * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * could consider to rework locking and base it on "shadow" fields.
+ */
+struct ubifs_inode {
+ struct inode vfs_inode;
+ unsigned long long creat_sqnum;
+ unsigned int xattr_size;
+ unsigned int xattr_cnt;
+ unsigned int xattr_names;
+ unsigned int dirty:1;
+ unsigned int xattr:1;
+ struct mutex ui_mutex;
+ spinlock_t ui_lock;
+ loff_t synced_i_size;
+ loff_t ui_size;
+ int flags;
+ int compr_type;
+ int data_len;
+ void *data;
+};
+
+/**
+ * struct ubifs_unclean_leb - records a LEB recovered under read-only mode.
+ * @list: list
+ * @lnum: LEB number of recovered LEB
+ * @endpt: offset where recovery ended
+ *
+ * This structure records a LEB identified during recovery that needs to be
+ * cleaned but was not because UBIFS was mounted read-only. The information
+ * is used to clean the LEB when remounting to read-write mode.
+ */
+struct ubifs_unclean_leb {
+ struct list_head list;
+ int lnum;
+ int endpt;
+};
+
+/*
+ * LEB properties flags.
+ *
+ * LPROPS_UNCAT: not categorized
+ * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
+ * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
+ * LPROPS_EMPTY: LEB is empty, not taken
+ * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
+ * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken
+ * LPROPS_CAT_MASK: mask for the LEB categories above
+ * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media)
+ * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash)
+ */
+enum {
+ LPROPS_UNCAT = 0,
+ LPROPS_DIRTY = 1,
+ LPROPS_DIRTY_IDX = 2,
+ LPROPS_FREE = 3,
+ LPROPS_HEAP_CNT = 3,
+ LPROPS_EMPTY = 4,
+ LPROPS_FREEABLE = 5,
+ LPROPS_FRDI_IDX = 6,
+ LPROPS_CAT_MASK = 15,
+ LPROPS_TAKEN = 16,
+ LPROPS_INDEX = 32,
+};
+
+/**
+ * struct ubifs_lprops - logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @flags: LEB properties flags (see above)
+ * @lnum: LEB number
+ * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE)
+ * @hpos: heap position in heap of same-category lprops (other categories)
+ */
+struct ubifs_lprops {
+ int free;
+ int dirty;
+ int flags;
+ int lnum;
+ union {
+ struct list_head list;
+ int hpos;
+ };
+};
+
+/**
+ * struct ubifs_lpt_lprops - LPT logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @tgc: trivial GC flag (1 => unmap after commit end)
+ * @cmt: commit flag (1 => reserved for commit)
+ */
+struct ubifs_lpt_lprops {
+ int free;
+ int dirty;
+ unsigned tgc : 1;
+ unsigned cmt : 1;
+};
+
+/**
+ * struct ubifs_lp_stats - statistics of eraseblocks in the main area.
+ * @empty_lebs: number of empty LEBs
+ * @taken_empty_lebs: number of taken empty LEBs
+ * @idx_lebs: number of indexing LEBs
+ * @total_free: total free space in bytes
+ * @total_dirty: total dirty space in bytes
+ * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (includes only data LEBs)
+ *
+ * N.B. total_dirty and total_used are different to other total_* fields,
+ * because they account _all_ LEBs, not just data LEBs.
+ *
+ * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
+ * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
+ * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
+ * by itself (in which case 'unused_lebs' would be a better name). In the case
+ * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
+ * but unlike other empty LEBs that are 'taken', it may not be written straight
+ * away (i.e. before the next commit start or unmount), so either gc_lnum must
+ * be specially accounted for, or the current approach followed i.e. count it
+ * under 'taken_empty_lebs'.
+ */
+struct ubifs_lp_stats {
+ int empty_lebs;
+ int taken_empty_lebs;
+ int idx_lebs;
+ long long total_free;
+ long long total_dirty;
+ long long total_used;
+ long long total_dead;
+ long long total_dark;
+};
+
+struct ubifs_nnode;
+
+/**
+ * struct ubifs_cnode - LEB Properties Tree common node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flag bits (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
+ * @iip: index in parent
+ * @level: level in the tree (zero for pnodes, greater than zero for nnodes)
+ * @num: node number
+ */
+struct ubifs_cnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+};
+
+/**
+ * struct ubifs_pnode - LEB Properties Tree leaf node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flag bits (%DIRTY_CNODE, %COW_CNODE or %OBSOLETE_CNODE)
+ * @iip: index in parent
+ * @level: level in the tree (always zero for pnodes)
+ * @num: node number
+ * @lprops: LEB properties array
+ */
+struct ubifs_pnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+ struct ubifs_lprops lprops[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_nbranch - LEB Properties Tree internal node branch.
+ * @lnum: LEB number of child
+ * @offs: offset of child
+ * @nnode: nnode child
+ * @pnode: pnode child
+ * @cnode: cnode child
+ *
+ * @nnode, @pnode and @cnode are members of one anonymous union, i.e. they are
+ * three differently typed views of the same child pointer.
+ */
+struct ubifs_nbranch {
+ int lnum;
+ int offs;
+ union {
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct ubifs_cnode *cnode;
+ };
+};
+
+/**
+ * struct ubifs_nnode - LEB Properties Tree internal node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (always greater than zero for nnodes)
+ * @num: node number
+ * @nbranch: branches to child nodes (%UBIFS_LPT_FANOUT elements)
+ *
+ * The members preceding @nbranch are the same, and in the same order, as the
+ * members of 'struct ubifs_cnode'.
+ */
+struct ubifs_nnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+ struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_lpt_heap - heap of categorized lprops.
+ * @arr: heap array (array of pointers to 'struct ubifs_lprops' objects)
+ * @cnt: number of elements currently in the heap
+ * @max_cnt: maximum number of elements allowed in the heap
+ *
+ * There are %LPROPS_HEAP_CNT heaps.
+ */
+struct ubifs_lpt_heap {
+ struct ubifs_lprops **arr;
+ int cnt;
+ int max_cnt;
+};
+
+/*
+ * Return codes for LPT scan callback function.
+ *
+ * LPT_SCAN_CONTINUE: continue scanning
+ * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory
+ * LPT_SCAN_STOP: stop scanning
+ */
+enum {
+ LPT_SCAN_CONTINUE = 0,
+ LPT_SCAN_ADD = 1,
+ LPT_SCAN_STOP = 2,
+};
+
+struct ubifs_info;
+
+/*
+ * Callback used by the 'ubifs_lpt_scan_nolock()' function. It is passed the
+ * UBIFS file-system description object, the scanned LEB properties, the
+ * 'in_tree' flag and an opaque 'data' pointer, and it returns an 'LPT_SCAN_*'
+ * code. NOTE(review): 'in_tree' presumably tells whether the LEB properties
+ * are already in the in-memory tree - confirm against the scan code.
+ */
+typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops,
+ int in_tree, void *data);
+
+/**
+ * struct ubifs_wbuf - UBIFS write-buffer.
+ * @c: UBIFS file-system description object
+ * @buf: write-buffer (of min. flash I/O unit size)
+ * @lnum: logical eraseblock number the write-buffer points to
+ * @offs: write-buffer offset in this logical eraseblock
+ * @avail: number of bytes available in the write-buffer
+ * @used: number of used bytes in the write-buffer
+ * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
+ * %UBI_UNKNOWN)
+ * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
+ * up by 'mutex_lock_nested()').
+ * @sync_callback: write-buffer synchronization callback
+ * @io_mutex: serializes write-buffer I/O
+ * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
+ * fields
+ * @timer: write-buffer timer
+ * @timeout: timer expire interval in jiffies
+ * @need_sync: it is set if its timer expired and needs sync
+ * @next_ino: points to the next position of the following inode number
+ * @inodes: stores the inode numbers of the nodes which are in wbuf
+ *
+ * The write-buffer synchronization callback is called when the write-buffer is
+ * synchronized in order to notify how much space was wasted due to
+ * write-buffer padding and how much free space is left in the LEB.
+ *
+ * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under
+ * spin-lock or mutex because they are written under both mutex and spin-lock.
+ * @buf is appended to under mutex but overwritten under both mutex and
+ * spin-lock. Thus the data between @buf and @buf + @used can be read under
+ * spin-lock.
+ */
+struct ubifs_wbuf {
+ struct ubifs_info *c;
+ void *buf;
+ int lnum;
+ int offs;
+ int avail;
+ int used;
+ int dtype;
+ int jhead;
+ int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
+ struct mutex io_mutex;
+ spinlock_t lock;
+ struct timer_list timer;
+ int timeout;
+ int need_sync;
+ int next_ino;
+ ino_t *inodes;
+};
+
+/**
+ * struct ubifs_bud - bud logical eraseblock.
+ * @lnum: logical eraseblock number
+ * @start: where the (uncommitted) bud data starts
+ * @jhead: journal head number this bud belongs to
+ * @list: link in the list of buds belonging to the same journal head
+ * @rb: link in the tree of all buds
+ */
+struct ubifs_bud {
+ int lnum;
+ int start;
+ int jhead;
+ struct list_head list;
+ struct rb_node rb;
+};
+
+/**
+ * struct ubifs_jhead - journal head.
+ * @wbuf: head's write-buffer
+ * @buds_list: list of bud LEBs belonging to this journal head
+ *
+ * Note, the @buds_list list is protected by the @c->buds_lock.
+ */
+struct ubifs_jhead {
+ struct ubifs_wbuf wbuf;
+ struct list_head buds_list;
+};
+
+/**
+ * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
+ * @key: key
+ * @znode: znode address in memory
+ * @leaf: untyped pointer sharing storage with @znode (same anonymous union);
+ * NOTE(review): presumably the in-memory leaf node - confirm in tnc.c
+ * @lnum: LEB number of the indexing node
+ * @offs: offset of the indexing node within @lnum
+ * @len: target node length
+ */
+struct ubifs_zbranch {
+ union ubifs_key key;
+ union {
+ struct ubifs_znode *znode;
+ void *leaf;
+ };
+ int lnum;
+ int offs;
+ int len;
+};
+
+/**
+ * struct ubifs_znode - in-memory representation of an indexing node.
+ * @parent: parent znode or NULL if it is the root
+ * @cnext: next znode to commit
+ * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
+ * @time: last access time (seconds)
+ * @level: level of the entry in the TNC tree
+ * @child_cnt: count of child znodes
+ * @iip: index in parent's zbranch array
+ * @alt: lower bound of key range has altered i.e. child inserted at slot 0
+ * @lnum: LEB number of the corresponding indexing node
+ * @offs: offset of the corresponding indexing node
+ * @len: length of the corresponding indexing node
+ * @zbranch: array of znode branches (@c->fanout elements)
+ *
+ * @lnum, @offs and @len exist only if CONFIG_UBIFS_FS_DEBUG is defined.
+ * @zbranch is a flexible array member, so it has to stay the last member.
+ */
+struct ubifs_znode {
+ struct ubifs_znode *parent;
+ struct ubifs_znode *cnext;
+ unsigned long flags;
+ unsigned long time;
+ int level;
+ int child_cnt;
+ int iip;
+ int alt;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ int lnum, offs, len;
+#endif
+ struct ubifs_zbranch zbranch[];
+};
+
+/**
+ * struct ubifs_node_range - node length range description data structure.
+ * @len: fixed node length
+ * @min_len: minimum possible node length
+ * @max_len: maximum possible node length
+ *
+ * If @max_len is %0, the node has fixed length @len. @len and @min_len are
+ * members of one anonymous union, i.e. they are two names for the same value.
+ */
+struct ubifs_node_range {
+ union {
+ int len;
+ int min_len;
+ };
+ int max_len;
+};
+
+/**
+ * struct ubifs_compressor - UBIFS compressor description structure.
+ * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
+ * @cc: cryptoapi compressor handle
+ * @comp_mutex: pointer to the mutex used during compression
+ * @decomp_mutex: pointer to the mutex used during decompression
+ * @name: compressor name
+ * @capi_name: cryptoapi compressor name
+ */
+struct ubifs_compressor {
+ int compr_type;
+ struct crypto_comp *cc;
+ struct mutex *comp_mutex;
+ struct mutex *decomp_mutex;
+ const char *name;
+ const char *capi_name;
+};
+
+/**
+ * struct ubifs_budget_req - budget requirements of an operation.
+ *
+ * @fast: non-zero if the budgeting should try to acquire budget quickly and
+ * should not try to call write-back
+ * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
+ * have to be re-calculated
+ * @new_page: non-zero if the operation adds a new page
+ * @dirtied_page: non-zero if the operation makes a page dirty
+ * @new_dent: non-zero if the operation adds a new directory entry
+ * @mod_dent: non-zero if the operation removes or modifies an existing
+ * directory entry
+ * @new_ino: non-zero if the operation adds a new inode
+ * @new_ino_d: how much data newly created inode contains
+ * @dirtied_ino: how many inodes the operation makes dirty
+ * @dirtied_ino_d: how much data dirtied inode contains
+ * @idx_growth: how much the index will supposedly grow
+ * @data_growth: how much new data the operation will supposedly add
+ * @dd_growth: how much data that makes other data dirty the operation will
+ * supposedly add
+ *
+ * @idx_growth, @data_growth and @dd_growth are not used in budget request. The
+ * budgeting subsystem caches index and data growth values there to avoid
+ * re-calculating them when the budget is released. However, if @idx_growth is
+ * %-1, it is calculated by the release function using other fields.
+ *
+ * An inode may contain 4KiB of data at max., thus the width of @new_ino_d
+ * is 13 bits, and of @dirtied_ino_d - 15 bits, because up to 4 inodes may be
+ * made dirty by the re-name operation.
+ */
+struct ubifs_budget_req {
+ unsigned int fast:1;
+ unsigned int recalculate:1;
+ unsigned int new_page:1;
+ unsigned int dirtied_page:1;
+ unsigned int new_dent:1;
+ unsigned int mod_dent:1;
+ unsigned int new_ino:1;
+ unsigned int new_ino_d:13;
+#ifndef UBIFS_DEBUG
+ unsigned int dirtied_ino:4;
+ unsigned int dirtied_ino_d:15;
+#else
+ /* Not bit-fields to check for overflows */
+ unsigned int dirtied_ino;
+ unsigned int dirtied_ino_d;
+#endif
+ int idx_growth;
+ int data_growth;
+ int dd_growth;
+};
+
+/**
+ * struct ubifs_orphan - stores the inode number of an orphan.
+ * @rb: rb-tree node of rb-tree of orphans sorted by inode number
+ * @list: link in the list of orphans in order added
+ * @new_list: link in the list of orphans added since the last commit
+ * @cnext: next orphan to commit
+ * @dnext: next orphan to delete
+ * @inum: inode number
+ * @new: %1 => added since the last commit, otherwise %0
+ */
+struct ubifs_orphan {
+ struct rb_node rb;
+ struct list_head list;
+ struct list_head new_list;
+ struct ubifs_orphan *cnext;
+ struct ubifs_orphan *dnext;
+ ino_t inum;
+ int new;
+};
+
+/**
+ * struct ubifs_mount_opts - UBIFS-specific mount options information.
+ * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast); the
+ * field is 2 bits wide
+ */
+struct ubifs_mount_opts {
+ unsigned int unmount_mode:2;
+};
+
+/**
+ * struct ubifs_info - UBIFS file-system description data structure
+ * (per-superblock).
+ * @vfs_sb: VFS @struct super_block object
+ * @bdi: backing device info object to make VFS happy and disable readahead
+ *
+ * @highest_inum: highest used inode number
+ * @vfs_gen: VFS inode generation counter
+ * @max_sqnum: current global sequence number
+ * @cmt_no: commit number (last successfully completed commit)
+ * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters
+ * @fmt_version: UBIFS on-flash format version
+ * @uuid: UUID from super block
+ *
+ * @lhead_lnum: log head logical eraseblock number
+ * @lhead_offs: log head offset
+ * @ltail_lnum: log tail logical eraseblock number (offset is always 0)
+ * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and
+ * @bud_bytes
+ * @min_log_bytes: minimum required number of bytes in the log
+ * @cmt_bud_bytes: used during commit to temporarily store the amount of bytes
+ * in committed buds
+ *
+ * @buds: tree of all buds indexed by bud LEB number
+ * @bud_bytes: how many bytes of flash is used by buds
+ * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud
+ * lists
+ * @jhead_cnt: count of journal heads
+ * @jheads: journal heads (head zero is base head)
+ * @max_bud_bytes: maximum number of bytes allowed in buds
+ * @bg_bud_bytes: number of bud bytes when background commit is initiated
+ * @old_buds: buds to be released after commit ends
+ * @max_bud_cnt: maximum number of buds
+ *
+ * @commit_sem: synchronizes committer with other processes
+ * @cmt_state: commit state
+ * @cs_lock: commit state lock
+ * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ * @fast_unmount: do not run journal commit before un-mounting
+ * @big_lpt: flag that LPT is too big to write whole during commit
+ * @check_lpt_free: flag that indicates LPT GC may be needed
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ * optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ * pool is full
+ *
+ * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
+ * @calc_idx_sz
+ * @zroot: zbranch which points to the root index node and znode
+ * @cnext: next znode to commit
+ * @enext: next znode to commit to empty space
+ * @gap_lebs: array of LEBs used by the in-gaps commit method
+ * @cbuf: commit buffer
+ * @ileb_buf: buffer for commit in-the-gaps method
+ * @ileb_len: length of data in ileb_buf
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @ilebs: pre-allocated index LEBs
+ * @ileb_cnt: number of pre-allocated index LEBs
+ * @ileb_nxt: next pre-allocated index LEBs
+ * @old_idx: tree of index nodes obsoleted since the last commit start
+ * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
+ * @new_ihead_lnum: used by debugging to check ihead_lnum
+ * @new_ihead_offs: used by debugging to check ihead_offs
+ *
+ * @mst_node: master node
+ * @mst_offs: offset of valid master node
+ * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ *
+ * @log_lebs: number of logical eraseblocks in the log
+ * @log_bytes: log size in bytes
+ * @log_last: last LEB of the log
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @lpt_first: first LEB of the lprops table area
+ * @lpt_last: last LEB of the lprops table area
+ * @orph_lebs: number of LEBs used for the orphan area
+ * @orph_first: first LEB of the orphan area
+ * @orph_last: last LEB of the orphan area
+ * @main_lebs: count of LEBs in the main area
+ * @main_first: first LEB of the main area
+ * @main_bytes: main area size in bytes
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ *
+ * @key_hash_type: type of the key hash
+ * @key_hash: direntry key hash function
+ * @key_fmt: key format
+ * @key_len: key length
+ * @fanout: fanout of the index tree (number of links per indexing node)
+ *
+ * @min_io_size: minimal input/output unit size
+ * @min_io_shift: number of bits in @min_io_size minus one
+ * @leb_size: logical eraseblock size in bytes
+ * @half_leb_size: half LEB size
+ * @leb_cnt: count of logical eraseblocks
+ * @max_leb_cnt: maximum count of logical eraseblocks
+ * @old_leb_cnt: count of logical eraseblocks before re-size
+ * @ro_media: the underlying UBI volume is read-only
+ *
+ * @dirty_pg_cnt: number of dirty pages (not used)
+ * @dirty_zn_cnt: number of dirty znodes
+ * @clean_zn_cnt: number of clean znodes
+ *
+ * @budg_idx_growth: amount of bytes budgeted for index growth
+ * @budg_data_growth: amount of bytes budgeted for cached data
+ * @budg_dd_growth: amount of bytes budgeted for cached data that will make
+ * other data dirty
+ * @budg_uncommitted_idx: amount of bytes that were budgeted for growth of the
+ * index, but which still have to be taken into account
+ * because the index has not been committed so far
+ * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
+ * @budg_uncommitted_idx, @min_idx_lebs, @old_idx_sz, and @lst
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @old_idx_sz: size of index on flash
+ * @calc_idx_sz: temporary variable which is used to calculate new index size
+ * (contains accurate new index size at end of TNC commit start)
+ * @lst: lprops statistics
+ *
+ * @page_budget: budget for a page
+ * @inode_budget: budget for an inode
+ * @dent_budget: budget for a directory entry
+ *
+ * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
+ * I/O unit
+ * @mst_node_alsz: master node aligned size
+ * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
+ * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
+ * @max_inode_sz: maximum possible inode size in bytes
+ * @max_znode_sz: size of znode in bytes
+ * @dead_wm: LEB dead space watermark
+ * @dark_wm: LEB dark space watermark
+ * @block_cnt: count of 4KiB blocks on the FS
+ *
+ * @ranges: UBIFS node length ranges
+ * @ubi: UBI volume descriptor
+ * @di: UBI device information
+ * @vi: UBI volume information
+ *
+ * @orph_tree: rb-tree of orphan inode numbers
+ * @orph_list: list of orphan inode numbers in order added
+ * @orph_new: list of orphan inode numbers added since last commit
+ * @orph_cnext: next orphan to commit
+ * @orph_dnext: next orphan to delete
+ * @orphan_lock: lock for orph_tree and orph_new
+ * @orph_buf: buffer for orphan nodes
+ * @new_orphans: number of orphans since last commit
+ * @cmt_orphans: number of orphans being committed
+ * @tot_orphans: number of orphans in the rb_tree
+ * @max_orphans: maximum number of orphans allowed
+ * @ohead_lnum: orphan head LEB number
+ * @ohead_offs: orphan head offset
+ * @no_orphs: non-zero if there are no orphans
+ *
+ * @bgt: UBIFS background thread
+ * @bgt_name: background thread name
+ * @need_bgt: if background thread should run
+ * @need_wbuf_sync: if write-buffers have to be synchronized
+ *
+ * @gc_lnum: LEB number used for garbage collection
+ * @sbuf: a buffer of LEB size used by GC and replay for scanning
+ * @idx_gc: list of index LEBs that have been garbage collected
+ * @idx_gc_cnt: number of elements on the idx_gc list
+ *
+ * @infos_list: links all 'ubifs_info' objects
+ * @umount_mutex: serializes shrinker and un-mount
+ * @shrinker_run_no: shrinker run number
+ *
+ * @space_bits: number of bits needed to record free or dirty space
+ * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT
+ * @lpt_offs_bits: number of bits needed to record an offset in the LPT
+ * @lpt_spc_bits: number of bits needed to record space in the LPT
+ * @pcnt_bits: number of bits needed to record pnode or nnode number
+ * @lnum_bits: number of bits needed to record LEB number
+ * @nnode_sz: size of on-flash nnode
+ * @pnode_sz: size of on-flash pnode
+ * @ltab_sz: size of on-flash LPT lprops table
+ * @lsave_sz: size of on-flash LPT save table
+ * @pnode_cnt: number of pnodes
+ * @nnode_cnt: number of nnodes
+ * @lpt_hght: height of the LPT
+ * @pnodes_have: number of pnodes in memory
+ *
+ * @lp_mutex: protects lprops table and all the other lprops-related fields
+ * @lpt_lnum: LEB number of the root nnode of the LPT
+ * @lpt_offs: offset of the root nnode of the LPT
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
+ * @dirty_nn_cnt: number of dirty nnodes
+ * @dirty_pn_cnt: number of dirty pnodes
+ * @lpt_sz: LPT size
+ * @lpt_nod_buf: buffer for an on-flash nnode or pnode
+ * @lpt_buf: buffer of LEB size used by LPT
+ * @nroot: address in memory of the root nnode of the LPT
+ * @lpt_cnext: next LPT node to commit
+ * @lpt_heap: array of heaps of categorized lprops
+ * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at
+ * previous commit start
+ * @uncat_list: list of un-categorized LEBs
+ * @empty_list: list of empty LEBs
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
+ * @freeable_cnt: number of freeable LEBs in @freeable_list
+ *
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @ltab: LPT's own lprops table
+ * @ltab_cmt: LPT's own lprops table (commit copy)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @lsave_lnum: LEB number of LPT's save table
+ * @lsave_offs: offset of LPT's save table
+ * @lsave: LPT's save table
+ * @lscan_lnum: LEB number of last LPT scan
+ *
+ * @rp_size: size of the reserved pool in bytes
+ * @report_rp_size: size of the reserved pool reported to user-space
+ * @rp_uid: reserved pool user ID
+ * @rp_gid: reserved pool group ID
+ *
+ * @empty: if the UBI device is empty
+ * @replay_tree: temporary tree used during journal replay
+ * @replay_list: temporary list used during journal replay
+ * @replay_buds: list of buds to replay
+ * @cs_sqnum: sequence number of first node in the log (commit start node)
+ * @replay_sqnum: sequence number of node currently being replayed
+ * @need_recovery: file-system needs recovery
+ * @replaying: set to %1 during journal replay
+ * @unclean_leb_list: LEBs to recover when mounting ro to rw
+ * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
+ * @size_tree: inode size information for recovery
+ * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @mount_opts: UBIFS-specific mount options
+ *
+ * @dbg_buf: a buffer of LEB size used for debugging purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ */
+struct ubifs_info {
+ struct super_block *vfs_sb;
+ struct backing_dev_info bdi;
+
+ ino_t highest_inum;
+ unsigned int vfs_gen;
+ unsigned long long max_sqnum;
+ unsigned long long cmt_no;
+ spinlock_t cnt_lock;
+ int fmt_version;
+ unsigned char uuid[16];
+
+ int lhead_lnum;
+ int lhead_offs;
+ int ltail_lnum;
+ struct mutex log_mutex;
+ int min_log_bytes;
+ long long cmt_bud_bytes;
+
+ struct rb_root buds;
+ long long bud_bytes;
+ spinlock_t buds_lock;
+ int jhead_cnt;
+ struct ubifs_jhead *jheads;
+ long long max_bud_bytes;
+ long long bg_bud_bytes;
+ struct list_head old_buds;
+ int max_bud_cnt;
+
+ struct rw_semaphore commit_sem;
+ int cmt_state;
+ spinlock_t cs_lock;
+ wait_queue_head_t cmt_wq;
+ unsigned int fast_unmount:1;
+ unsigned int big_lpt:1;
+ unsigned int check_lpt_free:1;
+ unsigned int nospace:1;
+ unsigned int nospace_rp:1;
+
+ struct mutex tnc_mutex;
+ struct ubifs_zbranch zroot;
+ struct ubifs_znode *cnext;
+ struct ubifs_znode *enext;
+ int *gap_lebs;
+ void *cbuf;
+ void *ileb_buf;
+ int ileb_len;
+ int ihead_lnum;
+ int ihead_offs;
+ int *ilebs;
+ int ileb_cnt;
+ int ileb_nxt;
+ struct rb_root old_idx;
+ int *bottom_up_buf;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ int new_ihead_lnum;
+ int new_ihead_offs;
+#endif
+
+ struct ubifs_mst_node *mst_node;
+ int mst_offs;
+ struct mutex mst_mutex;
+
+ int log_lebs;
+ long long log_bytes;
+ int log_last;
+ int lpt_lebs;
+ int lpt_first;
+ int lpt_last;
+ int orph_lebs;
+ int orph_first;
+ int orph_last;
+ int main_lebs;
+ int main_first;
+ long long main_bytes;
+ int default_compr;
+
+ uint8_t key_hash_type;
+ uint32_t (*key_hash)(const char *str, int len);
+ int key_fmt;
+ int key_len;
+ int fanout;
+
+ int min_io_size;
+ int min_io_shift;
+ int leb_size;
+ int half_leb_size;
+ int leb_cnt;
+ int max_leb_cnt;
+ int old_leb_cnt;
+ int ro_media;
+
+ atomic_long_t dirty_pg_cnt;
+ atomic_long_t dirty_zn_cnt;
+ atomic_long_t clean_zn_cnt;
+
+ long long budg_idx_growth;
+ long long budg_data_growth;
+ long long budg_dd_growth;
+ long long budg_uncommitted_idx;
+ spinlock_t space_lock;
+ int min_idx_lebs;
+ unsigned long long old_idx_sz;
+ unsigned long long calc_idx_sz;
+ struct ubifs_lp_stats lst;
+
+ int page_budget;
+ int inode_budget;
+ int dent_budget;
+
+ int ref_node_alsz;
+ int mst_node_alsz;
+ int min_idx_node_sz;
+ int max_idx_node_sz;
+ long long max_inode_sz;
+ int max_znode_sz;
+ int dead_wm;
+ int dark_wm;
+ int block_cnt;
+
+ struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
+ struct ubi_volume_desc *ubi;
+ struct ubi_device_info di;
+ struct ubi_volume_info vi;
+
+ struct rb_root orph_tree;
+ struct list_head orph_list;
+ struct list_head orph_new;
+ struct ubifs_orphan *orph_cnext;
+ struct ubifs_orphan *orph_dnext;
+ spinlock_t orphan_lock;
+ void *orph_buf;
+ int new_orphans;
+ int cmt_orphans;
+ int tot_orphans;
+ int max_orphans;
+ int ohead_lnum;
+ int ohead_offs;
+ int no_orphs;
+
+ struct task_struct *bgt;
+ char bgt_name[sizeof(BGT_NAME_PATTERN) + 9];
+ int need_bgt;
+ int need_wbuf_sync;
+
+ int gc_lnum;
+ void *sbuf;
+ struct list_head idx_gc;
+ int idx_gc_cnt;
+
+ struct list_head infos_list;
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
+
+ int space_bits;
+ int lpt_lnum_bits;
+ int lpt_offs_bits;
+ int lpt_spc_bits;
+ int pcnt_bits;
+ int lnum_bits;
+ int nnode_sz;
+ int pnode_sz;
+ int ltab_sz;
+ int lsave_sz;
+ int pnode_cnt;
+ int nnode_cnt;
+ int lpt_hght;
+ int pnodes_have;
+
+ struct mutex lp_mutex;
+ int lpt_lnum;
+ int lpt_offs;
+ int nhead_lnum;
+ int nhead_offs;
+ int lpt_drty_flgs;
+ int dirty_nn_cnt;
+ int dirty_pn_cnt;
+ long long lpt_sz;
+ void *lpt_nod_buf;
+ void *lpt_buf;
+ struct ubifs_nnode *nroot;
+ struct ubifs_cnode *lpt_cnext;
+ struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
+ struct ubifs_lpt_heap dirty_idx;
+ struct list_head uncat_list;
+ struct list_head empty_list;
+ struct list_head freeable_list;
+ struct list_head frdi_idx_list;
+ int freeable_cnt;
+
+ int ltab_lnum;
+ int ltab_offs;
+ struct ubifs_lpt_lprops *ltab;
+ struct ubifs_lpt_lprops *ltab_cmt;
+ int lsave_cnt;
+ int lsave_lnum;
+ int lsave_offs;
+ int *lsave;
+ int lscan_lnum;
+
+ long long rp_size;
+ long long report_rp_size;
+ uid_t rp_uid;
+ gid_t rp_gid;
+
+ /* The below fields are used only during mounting and re-mounting */
+ int empty;
+ struct rb_root replay_tree;
+ struct list_head replay_list;
+ struct list_head replay_buds;
+ unsigned long long cs_sqnum;
+ unsigned long long replay_sqnum;
+ int need_recovery;
+ int replaying;
+ struct list_head unclean_leb_list;
+ struct ubifs_mst_node *rcvrd_mst_node;
+ struct rb_root size_tree;
+ int remounting_rw;
+ struct ubifs_mount_opts mount_opts;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ void *dbg_buf;
+ struct ubifs_zbranch old_zroot;
+ int old_zroot_level;
+ unsigned long long old_zroot_sqnum;
+ int failure_mode;
+ int fail_delay;
+ unsigned long fail_timeout;
+ unsigned int fail_cnt;
+ unsigned int fail_cnt_max;
+#endif
+};
+
+extern struct list_head ubifs_infos;
+extern spinlock_t ubifs_infos_lock;
+extern atomic_long_t ubifs_clean_zn_cnt;
+extern struct kmem_cache *ubifs_inode_slab;
+extern struct super_operations ubifs_super_operations;
+extern struct address_space_operations ubifs_file_address_operations;
+extern struct file_operations ubifs_file_operations;
+extern struct inode_operations ubifs_file_inode_operations;
+extern struct file_operations ubifs_dir_operations;
+extern struct inode_operations ubifs_dir_inode_operations;
+extern struct inode_operations ubifs_symlink_inode_operations;
+extern struct backing_dev_info ubifs_backing_dev_info;
+extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/* io.c */
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
+ int dtype);
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+ int lnum, int offs);
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+ int lnum, int offs);
+int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
+ int offs, int dtype);
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+ int offs, int quiet);
+void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
+int ubifs_io_init(struct ubifs_info *c);
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf);
+int ubifs_bg_wbufs_sync(struct ubifs_info *c);
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum);
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
+
+/* scan.c */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int quiet);
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ int lnum, int offs);
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ void *buf, int offs);
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+ void *buf);
+
+/* log.c */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud);
+void ubifs_create_buds_lists(struct ubifs_info *c);
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs);
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum);
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum);
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum);
+int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum);
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum);
+int ubifs_consolidate_log(struct ubifs_info *c);
+
+/* journal.c */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+ const struct qstr *nm, const struct inode *inode,
+ int deletion, int xent);
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+ const union ubifs_key *key, const void *buf, int len);
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
+ int last_reference);
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry, int sync);
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+ loff_t old_size, loff_t new_size);
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+ const struct inode *inode, const struct qstr *nm);
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
+ const struct inode *inode2);
+
+/* budget.c */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+ struct ubifs_inode *ui);
+int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+long long ubifs_budg_get_free_space(struct ubifs_info *c);
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
+void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
+
+/* find.c */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+ int squeeze);
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+ int min_space, int pick_free);
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c);
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
+
+/* tnc.c */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n);
+int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
+ void *node);
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, const struct qstr *nm);
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, int *lnum, int *offs);
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+ int offs, int len);
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+ int old_lnum, int old_offs, int lnum, int offs, int len);
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+ int lnum, int offs, int len, const struct qstr *nm);
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+ const struct qstr *nm);
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+ union ubifs_key *to_key);
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+ union ubifs_key *key,
+ const struct qstr *nm);
+void ubifs_tnc_close(struct ubifs_info *c);
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs, int is_idx);
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs);
+/* Shared by tnc.c for tnc_commit.c */
+void destroy_old_idx(struct ubifs_info *c);
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs);
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+
+/* tnc_misc.c */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+ struct ubifs_znode *znode);
+int ubifs_search_zbranch(const struct ubifs_info *c,
+ const struct ubifs_znode *znode,
+ const union ubifs_key *key, int *n);
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode);
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode);
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr);
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr,
+ struct ubifs_znode *parent, int iip);
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node);
+
+/* tnc_commit.c */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int ubifs_tnc_end_commit(struct ubifs_info *c);
+
+/* shrinker.c */
+int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+
+/* commit.c */
+int ubifs_bg_thread(void *info);
+void ubifs_commit_required(struct ubifs_info *c);
+void ubifs_request_bg_commit(struct ubifs_info *c);
+int ubifs_run_commit(struct ubifs_info *c);
+void ubifs_recovery_commit(struct ubifs_info *c);
+int ubifs_gc_should_commit(struct ubifs_info *c);
+void ubifs_wait_for_commit(struct ubifs_info *c);
+
+/* master.c */
+int ubifs_read_master(struct ubifs_info *c);
+int ubifs_write_master(struct ubifs_info *c);
+
+/* sb.c */
+int ubifs_read_superblock(struct ubifs_info *c);
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+
+/* replay.c */
+int ubifs_validate_entry(struct ubifs_info *c,
+ const struct ubifs_dent_node *dent);
+int ubifs_replay_journal(struct ubifs_info *c);
+
+/* gc.c */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
+int ubifs_gc_start_commit(struct ubifs_info *c);
+int ubifs_gc_end_commit(struct ubifs_info *c);
+void ubifs_destroy_idx_gc(struct ubifs_info *c);
+int ubifs_get_idx_gc_leb(struct ubifs_info *c);
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
+
+/* orphan.c */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum);
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
+int ubifs_orphan_start_commit(struct ubifs_info *c);
+int ubifs_orphan_end_commit(struct ubifs_info *c);
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+
+/* lpt.c */
+int ubifs_calc_lpt_geom(struct ubifs_info *c);
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+ int *lpt_lebs, int *big_lpt);
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+ ubifs_lpt_scan_callback scan_cb, void *data);
+
+/* Shared by lpt.c for lpt_commit.c */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave);
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+ struct ubifs_lpt_lprops *ltab);
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+ struct ubifs_pnode *pnode);
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode);
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip);
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip);
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
+struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+
+/* lpt_commit.c */
+int ubifs_lpt_start_commit(struct ubifs_info *c);
+int ubifs_lpt_end_commit(struct ubifs_info *c);
+int ubifs_lpt_post_commit(struct ubifs_info *c);
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
+
+/* lprops.c */
+void ubifs_get_lprops(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+ const struct ubifs_lprops *lp,
+ int free, int dirty, int flags,
+ int idx_gc_cnt);
+void ubifs_release_lprops(struct ubifs_info *c);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+ int cat);
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+ struct ubifs_lprops *new_lprops);
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops);
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+ const struct ubifs_lprops *lprops);
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean, int idx_gc_cnt);
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean);
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp);
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+
+/* file.c */
+int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+
+/* dir.c */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+ int mode);
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+
+/* xattr.c */
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+ size_t size);
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ubifs_removexattr(struct dentry *dentry, const char *name);
+
+/* super.c */
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
+
+/* recovery.c */
+int ubifs_recover_master_node(struct ubifs_info *c);
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf, int grouped);
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
+int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf);
+int ubifs_rcvry_gc_commit(struct ubifs_info *c);
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+ int deletion, loff_t new_size);
+int ubifs_recover_size(struct ubifs_info *c);
+void ubifs_destroy_size_tree(struct ubifs_info *c);
+
+/* ioctl.c */
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void ubifs_set_inode_flags(struct inode *inode);
+#ifdef CONFIG_COMPAT
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#endif
+
+/* compressor.c */
+int __init ubifs_compressors_init(void);
+void __exit ubifs_compressors_exit(void);
+void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
+ int *compr_type);
+int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
+ int compr_type);
+
+#include "debug.h"
+#include "misc.h"
+#include "key.h"
+
+#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
new file mode 100644
index 00000000000..1388a078e1a
--- /dev/null
+++ b/fs/ubifs/xattr.c
@@ -0,0 +1,581 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS extended attributes support.
+ *
+ * Extended attributes are implemented as regular inodes with attached data,
+ * which limits extended attribute size to UBIFS block size (4KiB). Names of
+ * extended attributes are described by extended attribute entries (xentries),
+ * which are almost identical to directory entries, but have different key type.
+ *
+ * In other words, the situation with extended attributes is very similar to
+ * directories. Indeed, any inode (but of course not xattr inodes) may have a
+ * number of associated xentries, just like directory inodes have associated
+ * directory entries. Extended attribute entries store the name of the extended
+ * attribute, the host inode number, and the extended attribute inode number.
+ * Similarly, direntries store the name, the parent and the target inode
+ * numbers. Thus, most of the common UBIFS mechanisms may be re-used for
+ * extended attributes.
+ *
+ * The number of extended attributes is not limited, but there is Linux
+ * limitation on the maximum possible size of the list of all extended
+ * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure
+ * the sum of all extended attribute names of the inode does not exceed that
+ * limit.
+ *
+ * Extended attributes are synchronous, which means they are written to the
+ * flash media synchronously and there is no write-back for extended attribute
+ * inodes. The extended attribute values are not stored in compressed form on
+ * the media.
+ *
+ * Since extended attributes are represented by regular inodes, they are cached
+ * in the VFS inode cache. The xentries are cached in the LNC cache (see
+ * tnc.c).
+ *
+ * ACL support is not implemented.
+ */
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include "ubifs.h"
+
+/*
+ * Limit the number of extended attributes per inode so that the total size
+ * (xattr_size) is guaranteed to fit in an 'unsigned int'.
+ */
+#define MAX_XATTRS_PER_INODE 65535
+
+/*
+ * Extended attribute type constants.
+ *
+ * These are the name-space indices returned by 'check_namespace()' on
+ * success; each one corresponds to a supported attribute name prefix.
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+ USER_XATTR,
+ TRUSTED_XATTR,
+ SECURITY_XATTR,
+};
+
+/*
+ * Operation tables with no handlers filled in (static storage, so all members
+ * are zero). 'create_xattr()' installs them on every newly created extended
+ * attribute inode to re-define all of its operations to be "nothing".
+ */
+static struct inode_operations none_inode_operations;
+static struct address_space_operations none_address_operations;
+static struct file_operations none_file_operations;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. If the
+ * journal update fails, every piece of host accounting touched here
+ * (@xattr_cnt, @xattr_size and @xattr_names) is rolled back and the new inode
+ * is marked bad. This function returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+			const struct qstr *nm, const void *value, int size)
+{
+	int err;
+	struct inode *inode;
+	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+				.new_ino_d = size, .dirtied_ino = 1,
+				.dirtied_ino_d = host_ui->data_len};
+
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+		return -ENOSPC;
+	/*
+	 * Linux limits the maximum size of the extended attribute names list
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
+	 * extended attributes if the name list becomes larger. This limitation
+	 * is artificial for UBIFS, though.
+	 */
+	if (host_ui->xattr_names + host_ui->xattr_cnt +
+					nm->len + 1 > XATTR_LIST_MAX)
+		return -ENOSPC;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	mutex_lock(&host_ui->ui_mutex);
+	/* Re-define all operations to be "nothing" */
+	inode->i_mapping->a_ops = &none_address_operations;
+	inode->i_op = &none_inode_operations;
+	inode->i_fop = &none_file_operations;
+
+	inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
+	ui = ubifs_inode(inode);
+	ui->xattr = 1;
+	ui->flags |= UBIFS_XATTR_FL;
+	ui->data = kmalloc(size, GFP_NOFS);
+	if (!ui->data) {
+		/* Host accounting has not been touched yet, nothing to undo */
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	memcpy(ui->data, value, size);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+	host_ui->xattr_names += nm->len;
+
+	/*
+	 * We do not use i_size_write() because nobody can race with us as we
+	 * are holding host @host->i_mutex - every xattr operation for this
+	 * inode is serialized by it.
+	 */
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	iput(inode);
+	return 0;
+
+out_cancel:
+	/* Undo every accounting change made above, including the names length */
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	host_ui->xattr_names -= nm->len;
+out_unlock:
+	mutex_unlock(&host_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This helper function changes the value of extended attribute @inode with new
+ * data from @value. The new value buffer is allocated before the old one is
+ * released, so an allocation failure leaves both the extended attribute inode
+ * and the host accounting untouched. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int change_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const void *value, int size)
+{
+	int err, old_len;
+	void *buf;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 2,
+				.dirtied_ino_d = size + host_ui->data_len };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	/*
+	 * Prepare the new value first: if this fails the old value must stay
+	 * in place, otherwise @ui->data would be %NULL while @ui->data_len
+	 * still claims there is data.
+	 */
+	buf = kmalloc(size, GFP_NOFS);
+	if (!buf) {
+		err = -ENOMEM;
+		goto out_budg;
+	}
+	memcpy(buf, value, size);
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	/* Remember the old length, it is needed to roll the accounting back */
+	old_len = ui->data_len;
+	host_ui->xattr_size -= CALC_XATTR_BYTES(old_len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+
+	kfree(ui->data);
+	ui->data = buf;
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+
+	/*
+	 * It is important to write the host inode after the xattr inode
+	 * because if the host inode gets synchronized (via 'fsync()'), then
+	 * the extended attribute inode gets synchronized, because it goes
+	 * before the host inode in the write-buffer.
+	 */
+	err = ubifs_jnl_change_xattr(c, inode, host);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	/* @ui->data_len already holds @size here, so use the saved length */
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	host_ui->xattr_size += CALC_XATTR_BYTES(old_len);
+	make_bad_inode(inode);
+	mutex_unlock(&host_ui->ui_mutex);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * check_namespace - check extended attribute name-space.
+ * @nm: extended attribute name
+ *
+ * This function validates the length of @nm and matches it against the
+ * "trusted.", "user." and "security." prefixes, in that order. A name which
+ * consists of a prefix only is rejected. Returns the name-space index
+ * (%TRUSTED_XATTR, %USER_XATTR or %SECURITY_XATTR) in case of success,
+ * %-ENAMETOOLONG if the name is longer than %UBIFS_MAX_NLEN, %-EINVAL if
+ * nothing follows the prefix, and %-EOPNOTSUPP if the prefix is not supported.
+ */
+static int check_namespace(const struct qstr *nm)
+{
+	if (nm->len > UBIFS_MAX_NLEN)
+		return -ENAMETOOLONG;
+
+	if (strncmp(nm->name, XATTR_TRUSTED_PREFIX,
+		    XATTR_TRUSTED_PREFIX_LEN) == 0)
+		return nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0' ?
+		       -EINVAL : TRUSTED_XATTR;
+
+	if (strncmp(nm->name, XATTR_USER_PREFIX,
+		    XATTR_USER_PREFIX_LEN) == 0)
+		return nm->name[XATTR_USER_PREFIX_LEN] == '\0' ?
+		       -EINVAL : USER_XATTR;
+
+	if (strncmp(nm->name, XATTR_SECURITY_PREFIX,
+		    XATTR_SECURITY_PREFIX_LEN) == 0)
+		return nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0' ?
+		       -EINVAL : SECURITY_XATTR;
+
+	return -EOPNOTSUPP;
+}
+
+/**
+ * iget_xattr - get an extended attribute inode.
+ * @c: UBIFS file-system description object
+ * @inum: extended attribute inode number
+ *
+ * This helper reads inode @inum via 'ubifs_iget()' and makes sure it is
+ * flagged as an extended attribute inode. Returns the inode in case of
+ * success and an error pointer in case of failure (%-EINVAL if the inode is
+ * not an extended attribute inode, in which case the reference is dropped).
+ */
+static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
+{
+	struct inode *xino = ubifs_iget(c->vfs_sb, inum);
+
+	if (IS_ERR(xino)) {
+		ubifs_err("dead extended attribute entry, error %d",
+			  (int)PTR_ERR(xino));
+		return xino;
+	}
+
+	if (!ubifs_inode(xino)->xattr) {
+		ubifs_err("corrupt extended attribute entry");
+		iput(xino);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return xino;
+}
+
+/**
+ * ubifs_setxattr - set an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ * @flags: %XATTR_CREATE and/or %XATTR_REPLACE
+ *
+ * This function looks the extended attribute entry up and then either creates
+ * a new extended attribute or changes the value of the existing one. Returns
+ * zero in case of success and a negative error code in case of failure:
+ * %-ERANGE if @size is larger than %UBIFS_MAX_INO_DATA, %-ENODATA if
+ * %XATTR_REPLACE was requested but the attribute does not exist, %-EEXIST if
+ * %XATTR_CREATE was requested but the attribute already exists.
+ */
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	struct inode *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_dent_node *xent;
+	struct inode *xino;
+	union ubifs_key key;
+	int ret;
+
+	dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+
+	if (size > UBIFS_MAX_INO_DATA)
+		return -ERANGE;
+
+	ret = check_namespace(&nm);
+	if (ret < 0)
+		return ret;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	/*
+	 * The extended attribute entries are stored in LNC, so multiple
+	 * look-ups do not involve reading the flash.
+	 */
+	xent_key_init(c, &key, host->i_ino, &nm);
+	ret = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (ret == -ENOENT) {
+		/* No such xattr yet - create it unless asked not to */
+		if (flags & XATTR_REPLACE)
+			ret = -ENODATA;
+		else
+			ret = create_xattr(c, host, &nm, value, size);
+		goto out_free;
+	}
+	if (ret)
+		goto out_free;
+
+	/* The xattr exists - replace its value unless asked not to */
+	if (flags & XATTR_CREATE) {
+		ret = -EEXIST;
+		goto out_free;
+	}
+
+	xino = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(xino)) {
+		ret = PTR_ERR(xino);
+		goto out_free;
+	}
+
+	ret = change_xattr(c, host, xino, value, size);
+	iput(xino);
+
+out_free:
+	kfree(xent);
+	return ret;
+}
+
+/**
+ * ubifs_getxattr - get the value of an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ * @buf: buffer to copy the value to, or %NULL to query the value length only
+ * @size: size of @buf
+ *
+ * This function looks the extended attribute up under @host->i_mutex and
+ * copies its value to @buf if @buf is not %NULL. Returns the length of the
+ * value in case of success and a negative error code in case of failure:
+ * %-ENODATA if there is no such attribute, %-ERANGE if @buf is too small.
+ */
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+		       size_t size)
+{
+	struct inode *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_dent_node *xent;
+	struct ubifs_inode *xui;
+	struct inode *xino;
+	union ubifs_key key;
+	int ret;
+
+	dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+
+	ret = check_namespace(&nm);
+	if (ret < 0)
+		return ret;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	mutex_lock(&host->i_mutex);
+	xent_key_init(c, &key, host->i_ino, &nm);
+	ret = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (ret) {
+		/* A missing entry is reported as "no such attribute" */
+		ret = (ret == -ENOENT) ? -ENODATA : ret;
+		goto out_unlock;
+	}
+
+	xino = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(xino)) {
+		ret = PTR_ERR(xino);
+		goto out_unlock;
+	}
+
+	xui = ubifs_inode(xino);
+	ubifs_assert(xino->i_size == xui->data_len);
+	ubifs_assert(ubifs_inode(host)->xattr_size > xui->data_len);
+
+	/* If @buf is %NULL we are supposed to return the length only */
+	if (buf && xui->data_len > size) {
+		dbg_err("buffer size %zd, xattr len %d",
+			size, xui->data_len);
+		ret = -ERANGE;
+		goto out_iput;
+	}
+	if (buf)
+		memcpy(buf, xui->data, xui->data_len);
+	ret = xui->data_len;
+
+out_iput:
+	iput(xino);
+out_unlock:
+	mutex_unlock(&host->i_mutex);
+	kfree(xent);
+	return ret;
+}
+
+/**
+ * ubifs_listxattr - list extended attribute names of an inode.
+ * @dentry: dentry of the host inode
+ * @buffer: buffer to store the null-terminated names to, or %NULL to query
+ *          the required buffer size only
+ * @size: size of @buffer
+ *
+ * This function walks all extended attribute entries of the host inode and
+ * copies their names to @buffer. Names from the "trusted" name-space are shown
+ * only to callers which have %CAP_SYS_ADMIN. The required length is read and
+ * checked under @host->i_mutex - the same lock which is held while the entries
+ * are walked - so the accounting cannot change between the size check and the
+ * copying. Returns the number of bytes written (or the required buffer size if
+ * @buffer is %NULL) in case of success and a negative error code in case of
+ * failure (%-ERANGE if @buffer is too small).
+ */
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	union ubifs_key key;
+	struct inode *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	int err, len, written = 0;
+	struct qstr nm = { .name = NULL };
+
+	dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
+		dentry->d_name.len, dentry->d_name.name, size);
+
+	/*
+	 * Extended attribute creation and removal are serialized by
+	 * @host->i_mutex, so take it before looking at the accounting data.
+	 * Otherwise an attribute added after the size check could make the
+	 * loop below write past the end of @buffer.
+	 */
+	mutex_lock(&host->i_mutex);
+	len = host_ui->xattr_names + host_ui->xattr_cnt;
+	if (!buffer) {
+		/*
+		 * We should return the minimum buffer size which will fit a
+		 * null-terminated list of all the extended attribute names.
+		 */
+		mutex_unlock(&host->i_mutex);
+		return len;
+	}
+
+	if (len > size) {
+		mutex_unlock(&host->i_mutex);
+		return -ERANGE;
+	}
+
+	lowest_xent_key(c, &key, host->i_ino);
+
+	while (1) {
+		int type;
+
+		xent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (unlikely(IS_ERR(xent))) {
+			err = PTR_ERR(xent);
+			break;
+		}
+
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+
+		type = check_namespace(&nm);
+		if (unlikely(type < 0)) {
+			/* Only @pxent is freed after the loop, so drop @xent */
+			kfree(xent);
+			err = type;
+			break;
+		}
+
+		/* Show trusted namespace only for "power" users */
+		if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
+			memcpy(buffer + written, nm.name, nm.len + 1);
+			written += nm.len + 1;
+		}
+
+		/*
+		 * @nm still points into @xent, which is needed for the next
+		 * look-up, so only the entry before it may be freed now.
+		 */
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key);
+	}
+	mutex_unlock(&host->i_mutex);
+
+	kfree(pxent);
+	/* Running out of entries (%-ENOENT) is the normal way to finish */
+	if (err != -ENOENT) {
+		ubifs_err("cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	ubifs_assert(written <= size);
+	return written;
+}
+
+/**
+ * remove_xattr - remove an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @nm: extended attribute name
+ *
+ * This helper function updates the host inode accounting and writes the
+ * deletion to the journal. If the journal update fails, every piece of host
+ * accounting touched here (@xattr_cnt, @xattr_size and @xattr_names) is rolled
+ * back and @inode is marked bad. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int remove_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const struct qstr *nm)
+{
+	int err;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1,
+				.dirtied_ino_d = host_ui->data_len };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_names -= nm->len;
+
+	err = ubifs_jnl_delete_xattr(c, host, inode, nm);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	/* Undo every accounting change made above, including the names length */
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_names += nm->len;
+	mutex_unlock(&host_ui->ui_mutex);
+	ubifs_release_budget(c, &req);
+	make_bad_inode(inode);
+	return err;
+}
+
+/**
+ * ubifs_removexattr - remove an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ *
+ * This function looks the extended attribute up and removes it. The caller is
+ * expected to hold @host->i_mutex (this is asserted). Returns zero in case of
+ * success and a negative error code in case of failure (%-ENODATA if there is
+ * no such attribute).
+ */
+int ubifs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_dent_node *xent;
+	struct inode *xino;
+	union ubifs_key key;
+	int ret;
+
+	dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name);
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+	ret = check_namespace(&nm);
+	if (ret < 0)
+		return ret;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	xent_key_init(c, &key, host->i_ino, &nm);
+	ret = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (ret) {
+		/* A missing entry is reported as "no such attribute" */
+		ret = (ret == -ENOENT) ? -ENODATA : ret;
+		goto out_free;
+	}
+
+	xino = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(xino)) {
+		ret = PTR_ERR(xino);
+		goto out_free;
+	}
+
+	/*
+	 * Drop the only link before the removal is journalled and put it back
+	 * if the removal fails. If @i_nlink is 0, 'iput()' will delete the
+	 * inode.
+	 */
+	ubifs_assert(xino->i_nlink == 1);
+	xino->i_nlink = 0;
+	ret = remove_xattr(c, host, xino, &nm);
+	xino->i_nlink = ret ? 1 : 0;
+	iput(xino);
+
+out_free:
+	kfree(xent);
+	return ret;
+}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5..b546ba69be8 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
if (len == 0)
return -ENOENT;
- slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+ slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
if (slots == NULL)
return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *alias;
int err, table;
- lock_kernel();
+ lock_super(sb);
table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
if (IS_ERR(inode)) {
- unlock_kernel();
+ unlock_super(sb);
return ERR_CAST(inode);
}
alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
dput(alias);
else {
iput(inode);
- unlock_kernel();
+ unlock_super(sb);
return alias;
}
}
error:
- unlock_kernel();
+ unlock_super(sb);
dentry->d_op = &vfat_dentry_ops[table];
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
struct timespec ts;
int err;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_dir_empty(inode);
if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -791,10 +792,11 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(dir, &dentry->d_name, &sinfo);
if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, cluster;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec ts;
loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
goto out;
@@ -951,7 +954,7 @@ out:
brelse(sinfo.bh);
brelse(dotdot_bh);
brelse(old_sinfo.bh);
- unlock_kernel();
+ unlock_super(sb);
return err;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0e..ad3d26ddfe3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- /* If I'm the only one writing to this iclog, sync it to disk */
- if (atomic_read(&iclog->ic_refcnt) == 1) {
+ /*
+ * If I'm the only one writing to this iclog, sync it to disk.
+ * We need to do an atomic compare and decrement here to avoid
+ * racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+ /* we are the only one */
spin_unlock(&log->l_icloglock);
- if ((error = xlog_state_release_iclog(log, iclog)))
+ error = xlog_state_release_iclog(log, iclog);
+ if (error)
return error;
} else {
- atomic_dec(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
}
goto restart;