Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/async-thread.c  |   2
-rw-r--r--  fs/btrfs/backref.c       |  18
-rw-r--r--  fs/btrfs/ctree.c         |   7
-rw-r--r--  fs/btrfs/ctree.h         |   6
-rw-r--r--  fs/btrfs/delayed-inode.c |   3
-rw-r--r--  fs/btrfs/delayed-inode.h |   2
-rw-r--r--  fs/btrfs/dir-item.c      |  10
-rw-r--r--  fs/btrfs/disk-io.c       |   1
-rw-r--r--  fs/btrfs/extent-tree.c   |   7
-rw-r--r--  fs/btrfs/extent_io.c     |  85
-rw-r--r--  fs/btrfs/file.c          |  18
-rw-r--r--  fs/btrfs/inode-map.c     |   2
-rw-r--r--  fs/btrfs/inode.c         |  56
-rw-r--r--  fs/btrfs/ioctl.c         | 212
-rw-r--r--  fs/btrfs/send.c          |  16
-rw-r--r--  fs/btrfs/super.c         |  35
-rw-r--r--  fs/btrfs/transaction.c   |   8
-rw-r--r--  fs/btrfs/transaction.h   |   2
-rw-r--r--  fs/btrfs/tree-log.c      | 363
-rw-r--r--  fs/btrfs/volumes.c       |   1
-rw-r--r--  fs/btrfs/xattr.c         | 154
21 files changed, 850 insertions(+), 158 deletions(-)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..dcc1aae4c951 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -316,8 +316,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
 		list_add_tail(&work->ordered_list, &wq->ordered_list);
 		spin_unlock_irqrestore(&wq->list_lock, flags);
 	}
-	queue_work(wq->normal_wq, &work->normal_work);
 	trace_btrfs_work_queued(work);
+	queue_work(wq->normal_wq, &work->normal_work);
 }
 
 void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2d3e32ebfd15..69e596b1f95b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1371,7 +1371,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 		read_extent_buffer(eb, dest + bytes_left, name_off,
 				   name_len);
 		if (eb != eb_in) {
-			btrfs_tree_read_unlock_blocking(eb);
+			if (!path->skip_locking)
+				btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
 		}
 		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
@@ -1390,9 +1391,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		if (eb != eb_in) {
-			atomic_inc(&eb->refs);
-			btrfs_tree_read_lock(eb);
-			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			if (!path->skip_locking)
+				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			path->nodes[0] = NULL;
+			path->locks[0] = 0;
 		}
 		btrfs_release_path(path);
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
@@ -1778,7 +1780,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 	int found = 0;
 	struct extent_buffer *eb;
 	struct btrfs_inode_extref *extref;
-	struct extent_buffer *leaf;
 	u32 item_size;
 	u32 cur_offset;
 	unsigned long ptr;
@@ -1806,9 +1807,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, slot);
-		ptr = btrfs_item_ptr_offset(leaf, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+		ptr = btrfs_item_ptr_offset(eb, slot);
 		cur_offset = 0;
 
 		while (cur_offset < item_size) {
@@ -1822,7 +1822,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 			if (ret)
 				break;
 
-			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += btrfs_inode_extref_name_len(eb, extref);
 			cur_offset += sizeof(*extref);
 		}
 		btrfs_tree_read_unlock_blocking(eb);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c81ce0c7c1a9..39c68ef10808 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1542,6 +1542,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       trans->transid, root->fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
+		trans->dirty = true;
 		*cow_ret = buf;
 		return 0;
 	}
@@ -2762,8 +2763,10 @@ again:
 			 * then we don't want to set the path blocking,
 			 * so we test it here
 			 */
-			if (!should_cow_block(trans, root, b))
+			if (!should_cow_block(trans, root, b)) {
+				trans->dirty = true;
 				goto cow_done;
+			}
 
 			/*
 			 * must have write locks on this node and the
@@ -2920,7 +2923,7 @@ done:
 	 */
 	if (!p->leave_spinning)
 		btrfs_set_path_blocking(p);
-	if (ret < 0)
+	if (ret < 0 && !p->skip_release_on_error)
 		btrfs_release_path(p);
 	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..42d11e7cf7fe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
 	unsigned int need_commit_sem:1;
+	unsigned int skip_release_on_error:1;
 };
 
 /*
@@ -3686,6 +3687,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
 		    struct extent_buffer *leaf,
 		    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name,
+						 int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3861,6 +3866,7 @@ extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_is_empty_uuid(u8 *uuid);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index de4e70fb3cbb..37848167c4b8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
 *
 */
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-				    struct list_head *ins_list)
+				    struct list_head *ins_list, bool *emitted)
 {
 	struct btrfs_dir_item *di;
 	struct btrfs_delayed_item *curr, *next;
@@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 
 		if (over)
 			return 1;
+		*emitted = true;
 	}
 	return 0;
 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f25421..0167853c84ae 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 int btrfs_should_delete_dir_index(struct list_head *del_list,
 				  u64 index);
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
-				    struct list_head *ins_list);
+				    struct list_head *ins_list, bool *emitted);
 
 /* for init */
 int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
 #include "hash.h"
 #include "transaction.h"
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision.  data_size indicates how big the item inserted should be.  On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
 	unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b1709831b1a1..5177954e1a2c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2418,6 +2418,7 @@ int open_ctree(struct super_block *sb,
 	if (btrfs_check_super_csum(bh->b_data)) {
 		printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
 		err = -EINVAL;
+		brelse(bh);
 		goto fail_alloc;
 	}
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 950479f2d337..a067065efa6b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7237,7 +7237,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
-	trans->blocks_used++;
+	trans->dirty = true;
 	/* this returns a buffer locked for blocking */
 	return buf;
 }
@@ -9196,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	int ret = 0;
 
 	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
-		list_del_init(&block_group->bg_list);
 		if (ret)
-			continue;
+			goto next;
 
 		spin_lock(&block_group->lock);
 		memcpy(&item, &block_group->item, sizeof(item));
@@ -9213,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 					       key.objectid, key.offset);
 		if (ret)
 			btrfs_abort_transaction(trans, extent_root, ret);
+next:
+		list_del_init(&block_group->bg_list);
 	}
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424e0013..cb239ddae5c9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2767,7 +2767,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      bio_end_io_t end_io_func,
 			      int mirror_num,
 			      unsigned long prev_bio_flags,
-			      unsigned long bio_flags)
+			      unsigned long bio_flags,
+			      bool force_bio_submit)
 {
 	int ret = 0;
 	struct bio *bio;
@@ -2785,6 +2786,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			contig = bio_end_sector(bio) == sector;
 
 		if (prev_bio_flags != bio_flags || !contig ||
+		    force_bio_submit ||
 		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
 			ret = submit_one_bio(rw, bio, mirror_num,
@@ -2876,7 +2878,8 @@ static int __do_readpage(struct extent_io_tree *tree,
 			 get_extent_t *get_extent,
 			 struct extent_map **em_cached,
 			 struct bio **bio, int mirror_num,
-			 unsigned long *bio_flags, int rw)
+			 unsigned long *bio_flags, int rw,
+			 u64 *prev_em_start)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = page_offset(page);
@@ -2924,6 +2927,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 	}
 	while (cur <= end) {
 		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+		bool force_bio_submit = false;
 
 		if (cur >= last_byte) {
 			char *userpage;
@@ -2974,6 +2978,49 @@ static int __do_readpage(struct extent_io_tree *tree,
 		block_start = em->block_start;
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			block_start = EXTENT_MAP_HOLE;
+
+		/*
+		 * If we have a file range that points to a compressed extent
+		 * and it's followed by a consecutive file range that points to
+		 * to the same compressed extent (possibly with a different
+		 * offset and/or length, so it either points to the whole extent
+		 * or only part of it), we must make sure we do not submit a
+		 * single bio to populate the pages for the 2 ranges because
+		 * this makes the compressed extent read zero out the pages
+		 * belonging to the 2nd range. Imagine the following scenario:
+		 *
+		 *  File layout
+		 *  [0 - 8K]                     [8K - 24K]
+		 *    |                               |
+		 *    |                               |
+		 * points to extent X,         points to extent X,
+		 * offset 4K, length of 8K     offset 0, length 16K
+		 *
+		 * [extent X, compressed length = 4K uncompressed length = 16K]
+		 *
+		 * If the bio to read the compressed extent covers both ranges,
+		 * it will decompress extent X into the pages belonging to the
+		 * first range and then it will stop, zeroing out the remaining
+		 * pages that belong to the other range that points to extent X.
+		 * So here we make sure we submit 2 bios, one for the first
+		 * range and another one for the third range. Both will target
+		 * the same physical extent from disk, but we can't currently
+		 * make the compressed bio endio callback populate the pages
+		 * for both ranges because each compressed bio is tightly
+		 * coupled with a single extent map, and each range can have
+		 * an extent map with a different offset value relative to the
+		 * uncompressed data of our extent and different lengths. This
+		 * is a corner case so we prioritize correctness over
+		 * non-optimal behavior (submitting 2 bios for the same extent).
+		 */
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+		    prev_em_start && *prev_em_start != (u64)-1 &&
+		    *prev_em_start != em->orig_start)
+			force_bio_submit = true;
+
+		if (prev_em_start)
+			*prev_em_start = em->orig_start;
+
 		free_extent_map(em);
 		em = NULL;
 
@@ -3023,7 +3070,8 @@ static int __do_readpage(struct extent_io_tree *tree,
 					 bdev, bio, pnr,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
-					 this_bio_flag);
+					 this_bio_flag,
+					 force_bio_submit);
 		if (!ret) {
 			nr++;
 			*bio_flags = this_bio_flag;
@@ -3050,7 +3098,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 					     get_extent_t *get_extent,
 					     struct extent_map **em_cached,
 					     struct bio **bio, int mirror_num,
-					     unsigned long *bio_flags, int rw)
+					     unsigned long *bio_flags, int rw,
+					     u64 *prev_em_start)
 {
 	struct inode *inode;
 	struct btrfs_ordered_extent *ordered;
@@ -3070,7 +3119,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
-			      mirror_num, bio_flags, rw);
+			      mirror_num, bio_flags, rw, prev_em_start);
 		page_cache_release(pages[index]);
 	}
 }
@@ -3080,7 +3129,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
 			       int nr_pages, get_extent_t *get_extent,
 			       struct extent_map **em_cached,
 			       struct bio **bio, int mirror_num,
-			       unsigned long *bio_flags, int rw)
+			       unsigned long *bio_flags, int rw,
+			       u64 *prev_em_start)
 {
 	u64 start = 0;
 	u64 end = 0;
@@ -3101,7 +3151,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 						  index - first_index, start,
 						  end, get_extent, em_cached,
 						  bio, mirror_num, bio_flags,
-						  rw);
+						  rw, prev_em_start);
 			start = page_start;
 			end = start + PAGE_CACHE_SIZE - 1;
 			first_index = index;
@@ -3112,7 +3162,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		__do_contiguous_readpages(tree, &pages[first_index],
 					  index - first_index, start,
 					  end, get_extent, em_cached, bio,
-					  mirror_num, bio_flags, rw);
+					  mirror_num, bio_flags, rw,
+					  prev_em_start);
 }
 
 static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3138,7 +3189,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	}
 
 	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
-			    bio_flags, rw);
+			    bio_flags, rw, NULL);
 	return ret;
 }
 
@@ -3164,7 +3215,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
 	int ret;
 
 	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-			    &bio_flags, READ);
+			    &bio_flags, READ, NULL);
 	if (bio)
 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 	return ret;
@@ -3417,7 +3468,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 						 sector, iosize, pg_offset,
 						 bdev, &epd->bio, max_nr,
 						 end_bio_extent_writepage,
-						 0, 0, 0);
+						 0, 0, 0, false);
 		if (ret)
 			SetPageError(page);
 	}
@@ -3719,7 +3770,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 		ret = submit_extent_page(rw, tree, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
-					 0, epd->bio_flags, bio_flags);
+					 0, epd->bio_flags, bio_flags, false);
 		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_btree_ioerr(p);
@@ -4123,6 +4174,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct page *page;
 	struct extent_map *em_cached = NULL;
 	int nr = 0;
+	u64 prev_em_start = (u64)-1;
 
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		page = list_entry(pages->prev, struct page, lru);
@@ -4139,12 +4191,12 @@ int extent_readpages(struct extent_io_tree *tree,
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, READ);
+				   &bio, 0, &bio_flags, READ, &prev_em_start);
 		nr = 0;
 	}
 	if (nr)
 		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-				   &bio, 0, &bio_flags, READ);
+				   &bio, 0, &bio_flags, READ, &prev_em_start);
 
 	if (em_cached)
 		free_extent_map(em_cached);
@@ -4484,8 +4536,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
 					      em_len, flags);
-		if (ret)
+		if (ret) {
+			if (ret == 1)
+				ret = 0;
 			goto out_free;
+		}
 	}
 out_free:
 	free_extent_map(em);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5193c7844315..2ad4cb3da8f6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -760,8 +760,16 @@ next_slot:
 		}
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid > ino ||
-		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+		if (key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(key.objectid < ino) ||
+		    key.type < BTRFS_EXTENT_DATA_KEY) {
+			ASSERT(del_nr == 0);
+			path->slots[0]++;
+			goto next_slot;
+		}
+		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
 			break;
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -780,8 +788,8 @@ next_slot:
 				btrfs_file_extent_inline_len(leaf,
 						     path->slots[0], fi);
 		} else {
-			WARN_ON(1);
-			extent_end = search_start;
+			/* can't happen */
+			BUG();
 		}
 
 		/*
@@ -2789,7 +2797,7 @@ const struct file_operations btrfs_file_operations = {
 	.fallocate	= btrfs_fallocate,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_ioctl,
+	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 };
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646bd2e4b..ed2de833dd18 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -271,7 +271,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
 			__btrfs_add_free_space(ctl, info->offset, count);
 free:
 		rb_erase(&info->offset_index, rbroot);
-		kfree(info);
+		kmem_cache_free(btrfs_free_space_cachep, info);
 	}
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index edaa6178b4ec..ef677cdf777f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1268,8 +1268,14 @@ next_slot:
 			num_bytes = 0;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-		if (found_key.objectid > ino ||
-		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
+		if (found_key.objectid > ino)
+			break;
+		if (WARN_ON_ONCE(found_key.objectid < ino) ||
+		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
+			path->slots[0]++;
+			goto next_slot;
+		}
+		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
 		    found_key.offset > end)
 			break;
 
@@ -4802,7 +4808,8 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (!special_file(inode->i_mode))
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_free_io_failure_record(inode, 0, (u64)-1);
 
@@ -5334,6 +5341,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	char *name_ptr;
 	int name_len;
 	int is_curr = 0;	/* ctx->pos points to the current index? */
+	bool emitted;
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
@@ -5362,6 +5370,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	if (ret < 0)
 		goto err;
 
+	emitted = false;
 	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -5441,6 +5450,7 @@ skip:
 
 			if (over)
 				goto nopos;
+			emitted = true;
 			di_len = btrfs_dir_name_len(leaf, di) +
 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
 			di_cur += di_len;
@@ -5453,11 +5463,20 @@ next:
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
 			ctx->pos++;
-		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
 		if (ret)
 			goto nopos;
 	}
 
+	/*
+	 * If we haven't emitted any dir entry, we must not touch ctx->pos as
+	 * it was was set to the termination value in previous call. We assume
+	 * that "." and ".." were emitted if we reach this point and set the
+	 * termination value as well for an empty directory.
+	 */
+	if (ctx->pos > 2 && !emitted)
+		goto nopos;
+
 	/* Reached end of directory/root. Bump pos past the last item. */
 	ctx->pos++;
 
@@ -6073,7 +6092,7 @@ out_unlock_inode:
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		      struct dentry *dentry)
 {
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
 	u64 index;
@@ -6099,6 +6118,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	trans = btrfs_start_transaction(root, 5);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
+		trans = NULL;
 		goto fail;
 	}
 
@@ -6132,9 +6152,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	btrfs_end_transaction(trans, root);
 	btrfs_balance_delayed_items(root);
 fail:
+	if (trans)
+		btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -8034,15 +8055,28 @@ int btrfs_readpage(struct file *file, struct page *page)
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
-
+	struct inode *inode = page->mapping->host;
+	int ret;
 
 	if (current->flags & PF_MEMALLOC) {
 		redirty_page_for_writepage(wbc, page);
 		unlock_page(page);
 		return 0;
 	}
+
+	/*
+	 * If we are under memory pressure we will call this directly from the
+	 * VM, we need to make sure we have the inode referenced for the ordered
+	 * extent. If not just return like we didn't do anything.
+	 */
+	if (!igrab(inode)) {
+		redirty_page_for_writepage(wbc, page);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+	ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+	btrfs_add_delayed_iput(inode);
+	return ret;
 }
 
 static int btrfs_writepages(struct address_space *mapping,
@@ -9118,9 +9152,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
+	 * 1 item for updating parent inode item
+	 * 1 item for the inline extent item
 	 * 1 item for xattr if selinux is on
 	 */
-	trans = btrfs_start_transaction(root, 5);
+	trans = btrfs_start_transaction(root, 7);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -9477,7 +9513,7 @@ static const struct file_operations btrfs_dir_file_operations = {
 	.iterate	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_ioctl,
+	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.release        = btrfs_release_file,
 	.fsync		= btrfs_sync_file,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fce3b5b9a2bb..d96b2bc444c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1656,7 +1656,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 
 		src_inode = file_inode(src.file);
 		if (src_inode->i_sb != file_inode(file)->i_sb) {
-			btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
 				   "Snapshot src from another FS");
 			ret = -EXDEV;
 		} else if (!inode_owner_or_capable(src_inode)) {
@@ -2434,8 +2434,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out_unlock_inode;
 	}
 
-	d_invalidate(dentry);
-
 	down_write(&root->fs_info->subvol_sem);
 
 	err = may_destroy_subvol(dest);
@@ -2529,7 +2527,7 @@ out_up_write:
 out_unlock_inode:
 	mutex_unlock(&inode->i_mutex);
 	if (!err) {
-		shrink_dcache_sb(root->fs_info->sb);
+		d_invalidate(dentry);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
 		ASSERT(dest->send_in_progress == 0);
@@ -2961,7 +2959,7 @@ out_unlock:
 static long btrfs_ioctl_file_extent_same(struct file *file,
 					 struct btrfs_ioctl_same_args __user *argp)
 {
-	struct btrfs_ioctl_same_args *same;
+	struct btrfs_ioctl_same_args *same = NULL;
 	struct btrfs_ioctl_same_extent_info *info;
 	struct inode *src = file_inode(file);
 	u64 off;
@@ -2991,6 +2989,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 
 	if (IS_ERR(same)) {
 		ret = PTR_ERR(same);
+		same = NULL;
 		goto out;
 	}
 
@@ -3061,6 +3060,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 
 out:
 	mnt_drop_write_file(file);
+	kfree(same);
 	return ret;
 }
 
@@ -3187,6 +3187,150 @@ static void clone_update_extent_map(struct inode *inode,
 			&BTRFS_I(inode)->runtime_flags);
 }
 
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need turn the inline extent into a non-inline one (even
+ * with NO_HOLES enabled). If we find our destination inode only has one inline
+ * extent, just overwrite it with the source inline extent if its size is less
+ * than the source extent's size, or we could copy the source inline extent's
+ * data into the destination inode's inline extent if the later is greater then
+ * the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+				    struct inode *dst,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct btrfs_key *new_key,
+				    const u64 drop_start,
+				    const u64 datal,
+				    const u64 skip,
+				    const u64 size,
+				    char *inline_data)
+{
+	struct btrfs_root *root = BTRFS_I(dst)->root;
+	const u64 aligned_end = ALIGN(new_key->offset + datal,
+				      root->sectorsize);
+	int ret;
+	struct btrfs_key key;
+
+	if (new_key->offset > 0)
+		return -EOPNOTSUPP;
+
+	key.objectid = btrfs_ino(dst);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		return ret;
+	} else if (ret > 0) {
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				goto copy_inline_extent;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == btrfs_ino(dst) &&
+		    key.type == BTRFS_EXTENT_DATA_KEY) {
+			ASSERT(key.offset > 0);
+			return -EOPNOTSUPP;
+		}
+	} else if (i_size_read(dst) <= datal) {
+		struct btrfs_file_extent_item *ei;
+		u64 ext_len;
+
+		/*
+		 * If the file size is <= datal, make sure there are no other
+		 * extents following (can happen do to an fallocate call with
+		 * the flag FALLOC_FL_KEEP_SIZE).
+		 */
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		/*
+		 * If it's an inline extent, it can not have other extents
+		 * following it.
+		 */
+		if (btrfs_file_extent_type(path->nodes[0], ei) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto copy_inline_extent;
+
+		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+		if (ext_len > aligned_end)
+			return -EOPNOTSUPP;
+
+		ret = btrfs_next_item(root, path);
+		if (ret < 0) {
+			return ret;
+		} else if (ret == 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == btrfs_ino(dst) &&
+			    key.type == BTRFS_EXTENT_DATA_KEY)
+				return -EOPNOTSUPP;
+		}
+	}
+
+copy_inline_extent:
+	/*
+	 * We have no extent items, or we have an extent at offset 0 which may
+	 * or may not be inlined. All these cases are dealt the same way.
+	 */
+	if (i_size_read(dst) > datal) {
+		/*
+		 * If the destination inode has an inline extent...
+		 * This would require copying the data from the source inline
+		 * extent into the beginning of the destination's inline extent.
+		 * But this is really complex, both extents can be compressed
+		 * or just one of them, which would require decompressing and
+		 * re-compressing data (which could increase the new compressed
+		 * size, not allowing the compressed data to fit anymore in an
+		 * inline extent).
+		 * So just don't support this case for now (it should be rare,
+		 * we are not really saving space when cloning inline extents).
+		 */
+		return -EOPNOTSUPP;
+	}
+
+	btrfs_release_path(path);
+	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+	if (ret)
+		return ret;
+	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+	if (ret)
+		return ret;
+
+	if (skip) {
+		const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+		memmove(inline_data + start,
+			inline_data + start + skip,
+			datal);
+	}
+
+	write_extent_buffer(path->nodes[0], inline_data,
+			    btrfs_item_ptr_offset(path->nodes[0],
+						  path->slots[0]),
+			    size);
+	inode_add_bytes(dst, datal);
+
+	return 0;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -3451,7 +3595,6 @@ process_slot:
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				u64 skip = 0;
 				u64 trim = 0;
-				u64 aligned_end = 0;
 
 				if (off > key.offset) {
 					skip = off - key.offset;
@@ -3469,42 +3612,22 @@ process_slot:
 				size -= skip + trim;
 				datal -= skip + trim;
 
-				aligned_end = ALIGN(new_key.offset + datal,
-						    root->sectorsize);
-				ret = btrfs_drop_extents(trans, root, inode,
-							 drop_start,
-							 aligned_end,
-							 1);
+				ret = clone_copy_inline_extent(src, inode,
+							       trans, path,
+							       &new_key,
+							       drop_start,
+							       datal,
+							       skip, size, buf);
 				if (ret) {
 					if (ret != -EOPNOTSUPP)
 						btrfs_abort_transaction(trans,
-							root, ret);
-					btrfs_end_transaction(trans, root);
-					goto out;
-				}
-
-				ret = btrfs_insert_empty_item(trans, root, path,
-							      &new_key, size);
-				if (ret) {
-					btrfs_abort_transaction(trans, root,
-								ret);
+									root,
+									ret);
 					btrfs_end_transaction(trans, root);
 					goto out;
 				}
-
-				if (skip) {
-					u32 start =
-					  btrfs_file_extent_calc_inline_size(0);
-					memmove(buf+start, buf+start+skip,
-						datal);
-				}
-
 				leaf = path->nodes[0];
 				slot = path->slots[0];
-				write_extent_buffer(leaf, buf,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    size);
-				inode_add_bytes(inode, datal);
 			}
 
 			/* If we have an implicit hole (NO_HOLES feature). */
@@ -5372,3 +5495,24 @@ long btrfs_ioctl(struct file *file, unsigned int
 
 	return -ENOTTY;
 }
+
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	case FS_IOC32_GETVERSION:
+		cmd = FS_IOC_GETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 874828dd0a86..3cc2d1dfd7bf 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1461,7 +1461,21 @@ static int read_symlink(struct btrfs_root *root,
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
-	BUG_ON(ret);
+	if (ret) {
+		/*
+		 * An empty symlink inode. Can happen in rare error paths when
+		 * creating a symlink (transaction committed before the inode
+		 * eviction handler removed the symlink inode items and a crash
+		 * happened in between or the subvol was snapshoted in between).
+		 * Print an informative message to dmesg/syslog so that the user
+		 * can delete the symlink.
+		 */
+		btrfs_err(root->fs_info,
+			  "Found empty symlink inode %llu at root %llu",
+			  ino, root->root_key.objectid);
+		ret = -EIO;
+		goto out;
+	}
 
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cde9c03e3913..cf6d11bb8dcb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
-	if (!trans->blocks_used) {
+	if (!trans->dirty && list_empty(&trans->new_bgs)) {
 		const char *errstr;
 
 		errstr = btrfs_decode_error(errno);
@@ -901,6 +901,15 @@ find_root:
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);
 
+	if (!(sb->s_flags & MS_RDONLY)) {
+		int ret;
+		down_read(&fs_info->cleanup_work_sem);
+		ret = btrfs_orphan_cleanup(new_root);
+		up_read(&fs_info->cleanup_work_sem);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	dir_id = btrfs_root_dirid(&new_root->root_item);
setup_root:
 	location.objectid = dir_id;
@@ -1766,6 +1775,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
  * there are other factors that may change the result (like a new metadata
  * chunk).
  *
+ * If metadata is exhausted, f_bavail will be 0.
+ *
  * FIXME: not accurate for mixed block groups, total and free/used are ok,
  * available appears slightly larger.
  */
@@ -1777,11 +1788,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
 	u64 total_free_data = 0;
+	u64 total_free_meta = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)fs_info->fsid;
 	unsigned factor = 1;
 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
 	int ret;
+	u64 thresh = 0;
 
 	/*
 	 * holding chunk_muext to avoid allocating new chunks, holding
@@ -1809,6 +1822,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 				}
 			}
 		}
+		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+			total_free_meta += found->disk_total - found->disk_used;
 
 		total_used += found->disk_used;
 	}
@@ -1836,6 +1851,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
+	/*
+	 * We calculate the remaining metadata space minus global reserve. If
+	 * this is (supposedly) smaller than zero, there's no space. But this
+	 * does not hold in practice, the exhausted state happens where's still
+	 * some positive delta. So we apply some guesswork and compare the
+	 * delta to a 4M threshold. (Practically observed delta was ~2M.)
+	 *
+	 * We probably cannot calculate the exact threshold value because this
+	 * depends on the internal reservations requested by various
+	 * operations, so some operations that consume a few metadata will
+	 * succeed even if the Avail is zero. But this is better than the other
+	 * way around.
+	 */
+	thresh = 4 * 1024 * 1024;
+
+	if (total_free_meta - thresh < block_rsv->size)
+		buf->f_bavail = 0;
+
 	buf->f_type = BTRFS_SUPER_MAGIC;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_namelen = BTRFS_NAME_LEN;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 63c6d05950f2..30dbf315c2d6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -472,7 +472,6 @@ again:
 
 	h->transid = cur_trans->transid;
 	h->transaction = cur_trans;
-	h->blocks_used = 0;
 	h->bytes_reserved = 0;
 	h->root = root;
 	h->delayed_ref_updates = 0;
@@ -723,7 +722,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	if (!list_empty(&trans->ordered)) {
 		spin_lock(&info->trans_lock);
-		list_splice(&trans->ordered, &cur_trans->pending_ordered);
+		list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
 		spin_unlock(&info->trans_lock);
 	}
 
@@ -1732,7 +1731,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&root->fs_info->trans_lock);
-	list_splice(&trans->ordered, &cur_trans->pending_ordered);
+	list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
 		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
@@ -1756,8 +1755,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			spin_unlock(&root->fs_info->trans_lock);
 
 			wait_for_commit(root, prev_trans);
+			ret = prev_trans->aborted;
 
 			btrfs_put_transaction(prev_trans);
+			if (ret)
+				goto cleanup_transaction;
 		} else {
 			spin_unlock(&root->fs_info->trans_lock);
 		}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1ba9c3e04191..1cf5de30368a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -88,7 +88,6 @@ struct btrfs_trans_handle {
 	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
-	unsigned long blocks_used;
 	unsigned long delayed_ref_updates;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
@@ -98,6 +97,7 @@ struct btrfs_trans_handle {
 	bool allocating_chunk;
 	bool reloc_reserved;
 	bool sync;
+	bool dirty;
 	unsigned int type;
 	/*
 	 * this root is only needed to validate that the root passed to
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3b68c75eccea..f6c20cf6090e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3929,6 +3929,308 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 	return 0;
 }
 
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *inode,
+				struct btrfs_path *path,
+				struct btrfs_path *dst_path)
+{
+	int ret;
+	struct btrfs_key key;
+	const u64 ino = btrfs_ino(inode);
+	int ins_nr = 0;
+	int start_slot = 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_XATTR_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (true) {
+		int slot = path->slots[0];
+		struct extent_buffer *leaf = path->nodes[0];
+		int nritems = btrfs_header_nritems(leaf);
+
+		if (slot >= nritems) {
+			if (ins_nr > 0) {
+				u64 last_extent = 0;
+
+				ret = copy_items(trans, inode, dst_path, path,
+						 &last_extent, start_slot,
+						 ins_nr, 1, 0);
+				/* can't be 1, extent items aren't processed */
+				ASSERT(ret <= 0);
+				if (ret < 0)
+					return ret;
+				ins_nr = 0;
+			}
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				return ret;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		if (ins_nr == 0)
+			start_slot = slot;
+		ins_nr++;
+		path->slots[0]++;
+		cond_resched();
+	}
+	if (ins_nr > 0) {
+		u64 last_extent = 0;
+
+		ret = copy_items(trans, inode, dst_path, path,
+				 &last_extent, start_slot,
+				 ins_nr, 1, 0);
+		/* can't be 1, extent items aren't processed */
+		ASSERT(ret <= 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ *      1) create file with 128Kb of data
+ *      2) truncate file to 64Kb
+ *      3) truncate file to 256Kb
+ *      4) fsync file
+ *      5) <crash/power failure>
+ *      6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct inode *inode,
+				   struct btrfs_path *path)
+{
+	int ret;
+	struct btrfs_key key;
+	u64 hole_start;
+	u64 hole_size;
+	struct extent_buffer *leaf;
+	struct btrfs_root *log = root->log_root;
+	const u64 ino = btrfs_ino(inode);
+	const u64 i_size = i_size_read(inode);
+
+	if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		return 0;
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ASSERT(ret != 0);
+	if (ret < 0)
+		return ret;
+
+	ASSERT(path->slots[0] > 0);
+	path->slots[0]--;
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+		/* inode does not have any extents */
+		hole_start = 0;
+		hole_size = i_size;
+	} else {
+		struct btrfs_file_extent_item *extent;
+		u64 len;
+
+		/*
+		 * If there's an extent beyond i_size, an explicit hole was
+		 * already inserted by copy_items().
+		 */
+		if (key.offset >= i_size)
+			return 0;
+
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, extent) ==
+		    BTRFS_FILE_EXTENT_INLINE) {
+			len = btrfs_file_extent_inline_len(leaf,
+							   path->slots[0],
+							   extent);
+			ASSERT(len == i_size);
+			return 0;
+		}
+
+		len = btrfs_file_extent_num_bytes(leaf, extent);
+		/* Last extent goes beyond i_size, no need to log a hole. */
+		if (key.offset + len > i_size)
+			return 0;
+		hole_start = key.offset + len;
+		hole_size = i_size - hole_start;
+	}
+	btrfs_release_path(path);
+
+	/* Last extent ends at i_size. */
+	if (hole_size == 0)
+		return 0;
+
+	hole_size = ALIGN(hole_size, root->sectorsize);
+	ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+				       hole_size, 0, hole_size, 0, 0, 0);
+	return ret;
+}
+
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x                 # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+					 const int slot,
+					 const struct btrfs_key *key,
+					 struct inode *inode)
+{
+	int ret;
+	struct btrfs_path *search_path;
+	char *name = NULL;
+	u32 name_len = 0;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u32 cur_offset = 0;
+	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
+
+	search_path = btrfs_alloc_path();
+	if (!search_path)
+		return -ENOMEM;
+	search_path->search_commit_root = 1;
+	search_path->skip_locking = 1;
+
+	while (cur_offset < item_size) {
+		u64 parent;
+		u32 this_name_len;
+		u32 this_len;
+		unsigned long name_ptr;
+		struct btrfs_dir_item *di;
+
+		if (key->type == BTRFS_INODE_REF_KEY) {
+			struct btrfs_inode_ref *iref;
+
+			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+			parent = key->offset;
+			this_name_len = btrfs_inode_ref_name_len(eb, iref);
+			name_ptr = (unsigned long)(iref + 1);
+			this_len = sizeof(*iref) + this_name_len;
+		} else {
+			struct btrfs_inode_extref *extref;
+
+			extref = (struct btrfs_inode_extref *)(ptr +
+							       cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			this_name_len = btrfs_inode_extref_name_len(eb, extref);
+			name_ptr = (unsigned long)&extref->name;
+			this_len = sizeof(*extref) + this_name_len;
+		}
+
+		if (this_name_len > name_len) {
+			char *new_name;
+
+			new_name = krealloc(name, this_name_len, GFP_NOFS);
+			if (!new_name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			name_len = this_name_len;
+			name = new_name;
+		}
+
+		read_extent_buffer(eb, name, name_ptr, this_name_len);
+		di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+					   search_path, parent,
+					   name, this_name_len, 0);
+		if (di && !IS_ERR(di)) {
+			ret = 1;
+			goto out;
+		} else if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		btrfs_release_path(search_path);
+
+		cur_offset += this_len;
+	}
+	ret = 0;
+out:
+	btrfs_free_path(search_path);
+	kfree(name);
+	return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -3967,6 +4269,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	u64 ino = btrfs_ino(inode);
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 logged_isize = 0;
+	bool need_log_inode_item = true;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4056,11 +4359,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	} else {
 		if (inode_only == LOG_INODE_ALL)
 			fast_search = true;
-		ret = log_inode_item(trans, log, dst_path, inode);
-		if (ret) {
-			err = ret;
-			goto out_unlock;
-		}
 		goto log_extents;
 	}
 
@@ -4083,6 +4381,44 @@ again:
 		if (min_key.type > max_key.type)
 			break;
 
+		if (min_key.type == BTRFS_INODE_ITEM_KEY)
+			need_log_inode_item = false;
+
+		if ((min_key.type == BTRFS_INODE_REF_KEY ||
+		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+		    BTRFS_I(inode)->generation == trans->transid) {
+			ret = btrfs_check_ref_name_override(path->nodes[0],
+							    path->slots[0],
+							    &min_key, inode);
+			if (ret < 0) {
+				err = ret;
+				goto out_unlock;
+			} else if (ret > 0) {
+				err = 1;
+				btrfs_set_log_full_commit(root->fs_info, trans);
+				goto out_unlock;
+			}
+		}
+
+		/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+		if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (ins_nr == 0)
+				goto next_slot;
+			ret = copy_items(trans, inode, dst_path, path,
+					 &last_extent, ins_start_slot,
+					 ins_nr, inode_only, logged_isize);
+			if (ret < 0) {
+				err = ret;
+				goto out_unlock;
+			}
+			ins_nr = 0;
+			if (ret) {
+				btrfs_release_path(path);
+				continue;
+			}
+			goto next_slot;
+		}
+
 		src = path->nodes[0];
 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
 			ins_nr++;
@@ -4150,9 +4486,26 @@ next_slot:
 		ins_nr = 0;
 	}
 
+	btrfs_release_path(path);
+	btrfs_release_path(dst_path);
+	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+	if (err)
+		goto out_unlock;
+	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+		btrfs_release_path(path);
+		btrfs_release_path(dst_path);
+		err = btrfs_log_trailing_hole(trans, root, inode, path);
+		if (err)
+			goto out_unlock;
+	}
log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
+	if (need_log_inode_item) {
+		err = log_inode_item(trans, log, dst_path, inode);
+		if (err)
+			goto out_unlock;
+	}
 	if (fast_search) {
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
 						&logged_list, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d47289c715c8..25df49239ceb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,6 +162,7 @@ static struct btrfs_device *__alloc_device(void)
 	spin_lock_init(&dev->reada_lock);
 	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
+	btrfs_device_data_ordered_init(dev);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 00eacd83ce3d..fbb0533e977f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
 #include "xattr.h"
 #include "disk-io.h"
 #include "props.h"
+#include "locking.h"
 
 ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 			 void *buffer, size_t size)
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 		       struct inode *inode, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *di = NULL;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->skip_release_on_error = 1;
+
+	if (!value) {
+		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+					name, name_len, -1);
+		if (!di && (flags & XATTR_REPLACE))
+			ret = -ENODATA;
+		else if (di)
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		goto out;
+	}
 
+	/*
+	 * For a replace we can't just do the insert blindly.
+	 * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+	 * doesn't exist. If it exists, fall down below to the insert/replace
+	 * path - we can't race with a concurrent xattr delete, because the VFS
+	 * locks the inode's i_mutex before calling setxattr or removexattr.
+	 */
 	if (flags & XATTR_REPLACE) {
-		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
-					name_len, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out;
-		} else if (!di) {
+		ASSERT(mutex_is_locked(&inode->i_mutex));
+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+					name, name_len, 0);
+		if (!di) {
 			ret = -ENODATA;
 			goto out;
 		}
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		if (ret)
-			goto out;
 		btrfs_release_path(path);
+		di = NULL;
+	}
 
+	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+				      name, name_len, value, size);
+	if (ret == -EOVERFLOW) {
 		/*
-		 * remove the attribute
+		 * We have an existing item in a leaf, split_leaf couldn't
+		 * expand it. That item might have or not a dir_item that
+		 * matches our target xattr, so lets check.
 		 */
-		if (!value)
-			goto out;
-	} else {
-		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
-					name, name_len, 0);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
+		ret = 0;
+		btrfs_assert_tree_locked(path->nodes[0]);
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (!di && !(flags & XATTR_REPLACE)) {
+			ret = -ENOSPC;
 			goto out;
 		}
-		if (!di && !value)
-			goto out;
-		btrfs_release_path(path);
+	} else if (ret == -EEXIST) {
+		ret = 0;
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		ASSERT(di); /* logic error */
+	} else if (ret) {
+		goto out;
 	}
 
-again:
-	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
-				      name, name_len, value, size);
-	/*
-	 * If we're setting an xattr to a new value but the new value is say
-	 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
-	 * back from split_leaf.  This is because it thinks we'll be extending
-	 * the existing item size, but we're asking for enough space to add the
-	 * item itself.  So if we get EOVERFLOW just set ret to EEXIST and let
-	 * the rest of the function figure it out.
-	 */
-	if (ret == -EOVERFLOW)
+	if (di && (flags & XATTR_CREATE)) {
 		ret = -EEXIST;
+		goto out;
+	}
 
-	if (ret == -EEXIST) {
-		if (flags & XATTR_CREATE)
-			goto out;
+	if (di) {
 		/*
-		 * We can't use the path we already have since we won't have the
-		 * proper locking for a delete, so release the path and
-		 * re-lookup to delete the thing.
+		 * We're doing a replace, and it must be atomic, that is, at
+		 * any point in time we have either the old or the new xattr
+		 * value in the tree. We don't want readers (getxattr and
+		 * listxattrs) to miss a value, this is specially important
+		 * for ACLs.
 		 */
-		btrfs_release_path(path);
-		di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
-					name, name_len, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out;
-		} else if (!di) {
-			/* Shouldn't happen but just in case... */
-			btrfs_release_path(path);
-			goto again;
+		const int slot = path->slots[0];
+		struct extent_buffer *leaf = path->nodes[0];
+		const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+		const u32 item_size = btrfs_item_size_nr(leaf, slot);
+		const u32 data_size = sizeof(*di) + name_len + size;
+		struct btrfs_item *item;
+		unsigned long data_ptr;
+		char *ptr;
+
+		if (size > old_data_len) {
+			if (btrfs_leaf_free_space(root, leaf) <
+			    (size - old_data_len)) {
+				ret = -ENOSPC;
+				goto out;
+			}
 		}
 
-		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		if (ret)
-			goto out;
+		if (old_data_len + name_len + sizeof(*di) == item_size) {
+			/* No other xattrs packed in the same leaf item. */
+			if (size > old_data_len)
+				btrfs_extend_item(root, path,
+						  size - old_data_len);
+			else if (size < old_data_len)
+				btrfs_truncate_item(root, path, data_size, 1);
+		} else {
+			/* There are other xattrs packed in the same item. */
+			ret = btrfs_delete_one_dir_name(trans, root, path, di);
+			if (ret)
+				goto out;
+			btrfs_extend_item(root, path, data_size);
+		}
 
+		item = btrfs_item_nr(slot);
+		ptr = btrfs_item_ptr(leaf, slot, char);
+		ptr += btrfs_item_size(leaf, item) - data_size;
+		di = (struct btrfs_dir_item *)ptr;
+		btrfs_set_dir_data_len(leaf, di, size);
+		data_ptr = ((unsigned long)(di + 1)) + name_len;
+		write_extent_buffer(leaf, value, data_ptr, size);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
 		/*
-		 * We have a value to set, so go back and try to insert it now.
+		 * Insert, and we had space for the xattr, so path->slots[0] is
+		 * where our xattr dir_item is and btrfs_insert_xattr_item()
+		 * filled it.
 		 */
-		if (value) {
-			btrfs_release_path(path);
-			goto again;
-		}
 	}
 out:
 	btrfs_free_path(path);
@@ -273,8 +309,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		/* check to make sure this item is what we want */
 		if (found_key.objectid != key.objectid)
 			break;
-		if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
 			break;
+		if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+			goto next;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		if (verify_dir_item(root, leaf, di))