Skip to content

Commit 5254d41

Browse files
fdmananakdave
authored andcommitted
btrfs: fix zero size inode with non-zero size after log replay
When logging that an inode exists, as part of logging a new name or logging new dir entries for a directory, we always set the generation of the logged inode item to 0. This is to signal during log replay (in overwrite_item()), that we should not set the i_size since we only logged that an inode exists, so the i_size of the inode in the subvolume tree must be preserved (as when we log new names or that an inode exists, we don't log extents). This works fine except when we have already logged an inode in full mode or it's the first time we are logging an inode created in a past transaction, that inode has a new i_size of 0 and then we log a new name for the inode (due to a new hardlink or a rename), in which case we log an i_size of 0 for the inode and a generation of 0, which causes the log replay code to not update the inode's i_size to 0 (in overwrite_item()). An example scenario: mkdir /mnt/dir xfs_io -f -c "pwrite 0 64K" /mnt/dir/foo sync xfs_io -c "truncate 0" -c "fsync" /mnt/dir/foo ln /mnt/dir/foo /mnt/dir/bar xfs_io -c "fsync" /mnt/dir <power fail> After log replay the file remains with a size of 64K. This is because when we first log the inode, when we fsync file foo, we log its current i_size of 0, and then when we create a hard link we log again the inode in exists mode (LOG_INODE_EXISTS) but we set a generation of 0 for the inode item we add to the log tree, so during log replay overwrite_item() sees that the generation is 0 and i_size is 0 so we skip updating the inode's i_size from 64K to 0. Fix this by making sure at fill_inode_item() we always log the real generation of the inode if it was logged in the current transaction with the i_size we logged before. Also if an inode created in a previous transaction is logged in exists mode only, make sure we log the i_size stored in the inode item located from the commit root, so that if we log multiple times that the inode exists we get the correct i_size. A test case for fstests will follow soon. Reported-by: Vyacheslav Kovalevsky <[email protected]> Link: https://lore.kernel.org/linux-btrfs/[email protected]/ Signed-off-by: Filipe Manana <[email protected]> Signed-off-by: David Sterba <[email protected]>
1 parent b52fe51 commit 5254d41

1 file changed

Lines changed: 65 additions & 33 deletions

File tree

fs/btrfs/tree-log.c

Lines changed: 65 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
46164616
struct inode *inode, bool log_inode_only,
46174617
u64 logged_isize)
46184618
{
4619+
u64 gen = BTRFS_I(inode)->generation;
46194620
u64 flags;
46204621

46214622
if (log_inode_only) {
4622-
/* set the generation to zero so the recover code
4623-
* can tell the difference between an logging
4624-
* just to say 'this inode exists' and a logging
4625-
* to say 'update this inode with these values'
4623+
/*
4624+
* Set the generation to zero so the recover code can tell the
4625+
* difference between a logging just to say 'this inode exists'
4626+
* and a logging to say 'update this inode with these values'.
4627+
* But only if the inode was not already logged before.
4628+
* We access ->logged_trans directly since it was already set
4629+
* up in the call chain by btrfs_log_inode(), and data_race()
4630+
* to avoid false alerts from KCSAN and since it was set already
4631+
* and one can set it to 0 since that only happens on eviction
4632+
* and we are holding a ref on the inode.
46264633
*/
4627-
btrfs_set_inode_generation(leaf, item, 0);
4634+
ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0);
4635+
if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid)
4636+
gen = 0;
4637+
46284638
btrfs_set_inode_size(leaf, item, logged_isize);
46294639
} else {
4630-
btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
46314640
btrfs_set_inode_size(leaf, item, inode->i_size);
46324641
}
46334642

4643+
btrfs_set_inode_generation(leaf, item, gen);
4644+
46344645
btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
46354646
btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
46364647
btrfs_set_inode_mode(leaf, item, inode->i_mode);
@@ -5448,42 +5459,63 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
54485459
return 0;
54495460
}
54505461

5451-
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
5452-
struct btrfs_path *path, u64 *size_ret)
5462+
static int get_inode_size_to_log(struct btrfs_trans_handle *trans,
5463+
struct btrfs_inode *inode,
5464+
struct btrfs_path *path, u64 *size_ret)
54535465
{
54545466
struct btrfs_key key;
5467+
struct btrfs_inode_item *item;
54555468
int ret;
54565469

54575470
key.objectid = btrfs_ino(inode);
54585471
key.type = BTRFS_INODE_ITEM_KEY;
54595472
key.offset = 0;
54605473

5461-
ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5462-
if (ret < 0) {
5463-
return ret;
5464-
} else if (ret > 0) {
5465-
*size_ret = 0;
5466-
} else {
5467-
struct btrfs_inode_item *item;
5474+
/*
5475+
* Our caller called inode_logged(), so logged_trans is up to date.
5476+
* Use data_race() to silence any warning from KCSAN. Once logged_trans
5477+
* is set, it can only be reset to 0 after inode eviction.
5478+
*/
5479+
if (data_race(inode->logged_trans) == trans->transid) {
5480+
ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
5481+
} else if (inode->generation < trans->transid) {
5482+
path->search_commit_root = true;
5483+
path->skip_locking = true;
5484+
ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
5485+
path->search_commit_root = false;
5486+
path->skip_locking = false;
54685487

5469-
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5470-
struct btrfs_inode_item);
5471-
*size_ret = btrfs_inode_size(path->nodes[0], item);
5472-
/*
5473-
* If the in-memory inode's i_size is smaller then the inode
5474-
* size stored in the btree, return the inode's i_size, so
5475-
* that we get a correct inode size after replaying the log
5476-
* when before a power failure we had a shrinking truncate
5477-
* followed by addition of a new name (rename / new hard link).
5478-
* Otherwise return the inode size from the btree, to avoid
5479-
* data loss when replaying a log due to previously doing a
5480-
* write that expands the inode's size and logging a new name
5481-
* immediately after.
5482-
*/
5483-
if (*size_ret > inode->vfs_inode.i_size)
5484-
*size_ret = inode->vfs_inode.i_size;
5488+
} else {
5489+
*size_ret = 0;
5490+
return 0;
54855491
}
54865492

5493+
/*
5494+
* If the inode was logged before or is from a past transaction, then
5495+
* its inode item must exist in the log root or in the commit root.
5496+
*/
5497+
ASSERT(ret <= 0);
5498+
if (WARN_ON_ONCE(ret > 0))
5499+
ret = -ENOENT;
5500+
5501+
if (ret < 0)
5502+
return ret;
5503+
5504+
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5505+
struct btrfs_inode_item);
5506+
*size_ret = btrfs_inode_size(path->nodes[0], item);
5507+
/*
5508+
* If the in-memory inode's i_size is smaller then the inode size stored
5509+
* in the btree, return the inode's i_size, so that we get a correct
5510+
* inode size after replaying the log when before a power failure we had
5511+
* a shrinking truncate followed by addition of a new name (rename / new
5512+
* hard link). Otherwise return the inode size from the btree, to avoid
5513+
* data loss when replaying a log due to previously doing a write that
5514+
* expands the inode's size and logging a new name immediately after.
5515+
*/
5516+
if (*size_ret > inode->vfs_inode.i_size)
5517+
*size_ret = inode->vfs_inode.i_size;
5518+
54875519
btrfs_release_path(path);
54885520
return 0;
54895521
}
@@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
69967028
ret = drop_inode_items(trans, log, path, inode,
69977029
BTRFS_XATTR_ITEM_KEY);
69987030
} else {
6999-
if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
7031+
if (inode_only == LOG_INODE_EXISTS) {
70007032
/*
70017033
* Make sure the new inode item we write to the log has
70027034
* the same isize as the current one (if it exists).
@@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
70107042
* (zeroes), as if an expanding truncate happened,
70117043
* instead of getting a file of 4Kb only.
70127044
*/
7013-
ret = logged_inode_size(log, inode, path, &logged_isize);
7045+
ret = get_inode_size_to_log(trans, inode, path, &logged_isize);
70147046
if (ret)
70157047
goto out_unlock;
70167048
}

0 commit comments

Comments
 (0)