From e3014d14a81edde488d9a6758eea8afc41752d2d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:38:11 -0400 Subject: ext4: fixup free space calculations when expanding inodes Conditions checking whether there is enough free space in an xattr block and when xattr is large enough to make enough space in the inode forgot to account for the fact that inode need not be completely filled up with xattrs. Thus we could move unnecessarily many xattrs out of inode or even falsely claim there is not enough space to expand the inode. We also forgot to update the amount of free space in xattr block when moving more xattrs and thus could decide to move too big xattr resulting in unexpected failure. Fix these problems by properly updating free space in the inode and xattr block as we move xattrs. To simplify the math, avoid shifting xattrs after removing each one xattr and instead just shift xattrs only once there is enough free space in the inode. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 58 ++++++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2eb935ca5d9e..22d2ebcd1f09 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1350,7 +1350,8 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; - size_t min_offs, free; + size_t min_offs; + size_t ifree, bfree; int total_ino; void *base, *start, *end; int error = 0, tried_min_extra_isize = 0; @@ -1385,17 +1386,9 @@ retry: if (error) goto cleanup; - free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= isize_diff) { - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - new_extra_isize, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, - inode->i_sb->s_blocksize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - goto out; - } + ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (ifree >= isize_diff) + goto shift; /* * Enough free space isn't available in the inode, check if @@ -1416,8 +1409,8 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, NULL); - if (free < isize_diff) { + bfree = ext4_xattr_free_space(first, &min_offs, base, NULL); + if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; @@ -1428,10 +1421,10 @@ retry: goto cleanup; } } else { - free = inode->i_sb->s_blocksize; + bfree = inode->i_sb->s_blocksize; } - while (isize_diff > 0) { + while (isize_diff > ifree) { size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; struct ext4_xattr_info i = { @@ -1439,7 +1432,6 @@ retry: .value_len = 0, }; unsigned int total_size; /* EA entry size + value size */ - unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ unsigned int min_total_size = ~0U; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); @@ -1461,8 +1453,9 @@ retry: total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= free && total_size < min_total_size) { - if (total_size < isize_diff) { + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; @@ -1491,6 +1484,7 @@ retry: offs = le16_to_cpu(entry->e_value_offs); size = le32_to_cpu(entry->e_value_size); entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + EXT4_XATTR_SIZE(size); i.name_index = entry->e_name_index, buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); @@ -1518,21 +1512,8 @@ retry: if (error) goto cleanup; total_ino -= entry_size; - - entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff) - shift_bytes = isize_diff; - else - shift_bytes = entry_size + EXT4_XATTR_SIZE(size); - /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, -shift_bytes, - (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize + shift_bytes, - (void *)header, total_ino, inode->i_sb->s_blocksize); - - isize_diff -= shift_bytes; - EXT4_I(inode)->i_extra_isize += shift_bytes; - header = IHDR(inode, raw_inode); + ifree += total_size; + bfree -= total_size; i.name = b_entry_name; i.value = buffer; @@ -1553,6 +1534,15 @@ retry: kfree(is); kfree(bs); } + +shift: + /* Adjust the offsets and shift the remaining entries ahead */ + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); -- cgit 1.4.1 From 2de58f1102cf1ac6091209e1dfa8eccbcb039570 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:39:11 -0400 Subject: ext4: Check that external xattr value block is zero Currently we don't support xattrs with values stored out of line. Check for that in ext4_xattr_check_names() to make sure we never work with such xattrs since not all the code counts with that resulting is possible weird corruption issues. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 22d2ebcd1f09..f845cb7c6623 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -199,6 +199,8 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, } while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_block != 0) + return -EFSCORRUPTED; if (entry->e_value_size != 0 && (value_start + le16_to_cpu(entry->e_value_offs) < (void *)e + sizeof(__u32) || -- cgit 1.4.1 From 1cba423707b47886391c7fcb3614fc67394be06e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:40:11 -0400 Subject: ext4: remove checks for e_value_block Currently we don't support xattrs with e_value_block set. We don't allow them to pass initial xattr check so there's no point for checking for this later. Since these tests were untested, bugs were creeping in and not all places which should have checked were checking e_value_block anyway. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index f845cb7c6623..1447860b61ec 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -643,7 +643,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; @@ -663,7 +663,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; @@ -671,7 +671,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) } free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { size_t size = le32_to_cpu(s->here->e_value_size); free += EXT4_XATTR_SIZE(size); } @@ -693,7 +693,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) s->here->e_name_len = name_len; memcpy(s->here->e_name, i->name, name_len); } else { - if (!s->here->e_value_block && s->here->e_value_size) { + if (s->here->e_value_size) { void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(s->here->e_value_offs); void *val = s->base + offs; @@ -727,8 +727,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) + if (last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + size); last = EXT4_XATTR_NEXT(last); @@ -1327,7 +1326,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { + if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; BUG_ON(new_offs + le32_to_cpu(last->e_value_size) @@ -1726,7 +1725,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, *name++; } - if (entry->e_value_block == 0 && entry->e_value_size != 0) { + if (entry->e_value_size != 0) { __le32 *value = (__le32 *)((char *)header + le16_to_cpu(entry->e_value_offs)); for (n = (le32_to_cpu(entry->e_value_size) + -- cgit 1.4.1 From 94405713889d4a9d341b4ad92956e4e2ec8ec2c2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:41:11 -0400 Subject: ext4: replace bogus assertion in ext4_xattr_shift_entries() We were checking whether computed offsets do not exceed end of block in ext4_xattr_shift_entries(). However this does not make sense since we always only decrease offsets. So replace that assertion with a check whether we really decrease xattrs value offsets. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1447860b61ec..82b025c977fc 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1319,18 +1319,19 @@ retry: */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, - void *from, size_t n, int blocksize) + void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; + /* We always shift xattr headers further thus offsets get lower */ + BUG_ON(value_offs_shift > 0); + /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; - BUG_ON(new_offs + le32_to_cpu(last->e_value_size) - > blocksize); last->e_value_offs = cpu_to_le16(new_offs); } } @@ -1542,7 +1543,7 @@ shift: ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, inode->i_sb->s_blocksize); + (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; brelse(bh); out: -- cgit 1.4.1 From 3f2571c1f91f2de729562344b4956786a2c74d73 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:42:11 -0400 Subject: ext4: factor out xattr moving Factor out function for moving xattrs from inode into external xattr block from ext4_expand_extra_isize_ea(). That function is already quite long and factoring out this rather standalone functionality helps readability. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 159 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 74 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82b025c977fc..8f582ae1032d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1339,6 +1339,84 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, memmove(to, from, n); } +/* + * Move xattr pointed to by 'entry' from inode into external xattr block + */ +static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t value_offs, value_size; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + .name_index = entry->e_name_index, + }; + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int error; + + value_offs = le16_to_cpu(entry->e_value_offs); + value_size = le32_to_cpu(entry->e_value_size); + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + buffer = kmalloc(value_size, GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!is || !bs || !buffer || !b_entry_name) { + error = -ENOMEM; + goto out; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto out; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto out; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto out; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = value_size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto out; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto out; + error = 0; +out: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + + return error; +} + /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. @@ -1349,9 +1427,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; - struct ext4_xattr_ibody_find *is = NULL; - struct ext4_xattr_block_find *bs = NULL; - char *buffer = NULL, *b_entry_name = NULL; size_t min_offs; size_t ifree, bfree; int total_ino; @@ -1427,27 +1502,11 @@ retry: } while (isize_diff > ifree) { - size_t offs, size, entry_size; struct ext4_xattr_entry *small_entry = NULL; - struct ext4_xattr_info i = { - .value = NULL, - .value_len = 0, - }; - unsigned int total_size; /* EA entry size + value size */ + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size = ~0U; - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - if (!is || !bs) { - error = -ENOMEM; - goto cleanup; - } - - is->s.not_found = -ENODATA; - bs->s.not_found = -ENODATA; - is->iloc.bh = NULL; - bs->bh = NULL; - last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ entry = NULL; @@ -1474,8 +1533,6 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; - kfree(is); is = NULL; - kfree(bs); bs = NULL; brelse(bh); goto retry; } @@ -1483,58 +1540,18 @@ retry: goto cleanup; } } - offs = le16_to_cpu(entry->e_value_offs); - size = le32_to_cpu(entry->e_value_size); - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + EXT4_XATTR_SIZE(size); - i.name_index = entry->e_name_index, - buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!buffer || !b_entry_name) { - error = -ENOMEM; - goto cleanup; - } - /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + offs, - EXT4_XATTR_SIZE(size)); - memcpy(b_entry_name, entry->e_name, entry->e_name_len); - b_entry_name[entry->e_name_len] = '\0'; - i.name = b_entry_name; - - error = ext4_get_inode_loc(inode, &is->iloc); - if (error) - goto cleanup; - error = ext4_xattr_ibody_find(inode, &i, is); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); if (error) goto cleanup; - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto cleanup; total_ino -= entry_size; ifree += total_size; bfree -= total_size; - - i.name = b_entry_name; - i.value = buffer; - i.value_len = size; - error = ext4_xattr_block_find(inode, &i, bs); - if (error) - goto cleanup; - - /* Add entry which was removed from the inode into the block */ - error = ext4_xattr_block_set(handle, inode, &i, bs); - if (error) - goto cleanup; - kfree(b_entry_name); - kfree(buffer); - b_entry_name = NULL; - buffer = NULL; - brelse(is->iloc.bh); - kfree(is); - kfree(bs); } shift: @@ -1552,12 +1569,6 @@ out: return 0; cleanup: - kfree(b_entry_name); - kfree(buffer); - if (is) - brelse(is->iloc.bh); - kfree(is); - kfree(bs); brelse(bh); /* * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode -- cgit 1.4.1 From 6e0cd088c01023c02b1e887e02c8b6f3f395344f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:43:11 -0400 Subject: ext4: remove (almost) unused variables from ext4_expand_extra_isize_ea() 'start' variable is completely unused in ext4_expand_extra_isize_ea(). Variable 'first' is used only once in one place. So just remove them. Variables 'entry' and 'last' are only really used later in the function inside a loop. Move their declarations there. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8f582ae1032d..2ef687620205 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1425,12 +1425,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry, *last, *first; struct buffer_head *bh = NULL; size_t min_offs; size_t ifree, bfree; int total_ino; - void *base, *start, *end; + void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ @@ -1446,24 +1445,22 @@ retry: goto out; header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ - base = start = entry; + base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - last = entry; total_ino = sizeof(struct ext4_xattr_ibody_header); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - ifree = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; @@ -1483,10 +1480,10 @@ retry: goto cleanup; } base = BHDR(bh); - first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - bfree = ext4_xattr_free_space(first, &min_offs, base, NULL); + bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, + NULL); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1502,14 +1499,14 @@ retry: } while (isize_diff > ifree) { - struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_entry *small_entry = NULL, *entry = NULL; + struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ - entry = NULL; for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { total_size = EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + @@ -1556,8 +1553,7 @@ retry: shift: /* Adjust the offsets and shift the remaining entries ahead */ - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); -- cgit 1.4.1 From dfa2064b222c901b05c19ec5b7f42a25f7bee0e3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 29 Aug 2016 15:44:11 -0400 Subject: ext4: factor out loop for freeing inode xattr space Move loop to make enough space in the inode from ext4_expand_extra_isize_ea() into a separate function to make that function smaller and better readable and also to avoid delaration of variables inside a loop block. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 121 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 69 insertions(+), 52 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2ef687620205..c15d63389957 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1417,6 +1417,63 @@ out: return error; } +static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, + struct ext4_inode *raw_inode, + int isize_diff, size_t ifree, + size_t bfree, int *total_ino) +{ + struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + struct ext4_xattr_entry *small_entry; + struct ext4_xattr_entry *entry; + struct ext4_xattr_entry *last; + unsigned int entry_size; /* EA entry size */ + unsigned int total_size; /* EA entry size + value size */ + unsigned int min_total_size; + int error; + + while (isize_diff > ifree) { + entry = NULL; + small_entry = NULL; + min_total_size = ~0U; + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= bfree && + total_size < min_total_size) { + if (total_size + ifree < isize_diff) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry == NULL) + return -ENOSPC; + entry = small_entry; + } + + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + total_size = entry_size + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + error = ext4_xattr_move_to_block(handle, inode, raw_inode, + entry); + if (error) + return error; + + *total_ino -= entry_size; + ifree += total_size; + bfree -= total_size; + } + + return 0; +} + /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. @@ -1491,66 +1548,26 @@ retry: brelse(bh); goto retry; } - error = -1; + error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } - while (isize_diff > ifree) { - struct ext4_xattr_entry *small_entry = NULL, *entry = NULL; - struct ext4_xattr_entry *last; - unsigned int entry_size; /* EA entry size */ - unsigned int total_size; /* EA entry size + value size */ - unsigned int min_total_size = ~0U; - - last = IFIRST(header); - /* Find the entry best suited to be pushed into EA block */ - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= bfree && - total_size < min_total_size) { - if (total_size + ifree < isize_diff) { - small_entry = last; - } else { - entry = last; - min_total_size = total_size; - } - } - } - - if (entry == NULL) { - if (small_entry) { - entry = small_entry; - } else { - if (!tried_min_extra_isize && - s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - brelse(bh); - goto retry; - } - error = -1; - goto cleanup; - } + error = ext4_xattr_make_inode_space(handle, inode, raw_inode, + isize_diff, ifree, bfree, + &total_ino); + if (error) { + if (error == -ENOSPC && !tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; } - - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - total_size = entry_size + - EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); - error = ext4_xattr_move_to_block(handle, inode, raw_inode, - entry); - if (error) - goto cleanup; - - total_ino -= entry_size; - ifree += total_size; - bfree -= total_size; + goto cleanup; } - shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize -- cgit 1.4.1 From 14fbd4aa613bd5110556c281799ce36dc6f3ba97 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 29 Aug 2016 15:45:11 -0400 Subject: ext4: enforce online defrag restriction for encrypted files Online defragging of encrypted files is not currently implemented. However, the move extent ioctl can still return successfully when called. For example, this occurs when xfstest ext4/020 is run on an encrypted file system, resulting in a corrupted test file and a corresponding test failure. Until the proper functionality is implemented, fail the move extent ioctl if either the original or donor file is encrypted. Cc: stable@vger.kernel.org Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a920c5d29fac..6fc14def0c70 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -598,6 +598,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, return -EOPNOTSUPP; } + if (ext4_encrypted_inode(orig_inode) || + ext4_encrypted_inode(donor_inode)) { + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported for encrypted files"); + return -EOPNOTSUPP; + } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); -- cgit 1.4.1 From 93e3b4e6631d2a74a8cf7429138096862ff9f452 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 5 Sep 2016 22:56:10 -0400 Subject: ext4: reinforce check of i_dtime when clearing high fields of uid and gid Now, ext4_do_update_inode() clears high 16-bit fields of uid/gid of deleted and evicted inode to fix up interoperability with old kernels. However, it checks only i_dtime of an inode to determine whether the inode was deleted and evicted, and this is very risky, because i_dtime can be used for the pointer maintaining orphan inode list, too. We need to further check whether the i_dtime is being used for the orphan inode list even if the i_dtime is not NULL. We found that high 16-bit fields of uid/gid of inode are unintentionally and permanently cleared when the inode truncation is just triggered, but not finished, and the inode metadata, whose high uid/gid bits are cleared, is written on disk, and the sudden power-off follows that in order. Cc: stable@vger.kernel.org Signed-off-by: Daeho Jeong Signed-off-by: Hobin Woo Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6ea25a190f8..15d4b4182ac7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4814,14 +4814,14 @@ static int ext4_do_update_inode(handle_t *handle, * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if (!ei->i_dtime) { + if (ei->i_dtime && list_empty(&ei->i_orphan)) { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); -- cgit 1.4.1 From 49da939272f417ff94c40f132a308748b46efe68 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Sep 2016 23:08:16 -0400 Subject: ext4: enable quota enforcement based on mount options When quota information is stored in quota files, we enable only quota accounting on mount and enforcement is enabled only in response to Q_QUOTAON quotactl. To make ext4 behavior consistent with XFS, we add a possibility to enable quota enforcement on mount by specifying corresponding quota mount option (usrquota, grpquota, prjquota). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 12 +++++++++--- fs/ext4/super.c | 34 ++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ea31931386ec..0c2bf4444548 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1117,9 +1117,15 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden + * quota files */ +#define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable + * enforcement for hidden quota + * files */ +#define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota + * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ec8708989ca..5819b0e8ff66 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1267,7 +1267,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, @@ -1327,6 +1327,7 @@ static const match_table_t tokens = { {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, + {Opt_prjquota, "prjquota"}, {Opt_barrier, "barrier=%u"}, {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, @@ -1546,8 +1547,11 @@ static const struct mount_opts { MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, + {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, + MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), + MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_offusrjquota, 0, MOPT_Q}, @@ -1836,13 +1840,17 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && - (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { - ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota " - "mount options ignored."); - clear_opt(sb, USRQUOTA); - clear_opt(sb, GRPQUOTA); - } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) { + ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return 0; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) clear_opt(sb, USRQUOTA); @@ -5250,12 +5258,18 @@ static int ext4_enable_quotas(struct super_block *sb) le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; + bool quota_mopt[EXT4_MAXQUOTAS] = { + test_opt(sb, USRQUOTA), + test_opt(sb, GRPQUOTA), + test_opt(sb, PRJQUOTA), + }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, - DQUOT_USAGE_ENABLED); + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " -- cgit 1.4.1 From 0b7b77791cc1d99cbca08d5bc8210e57e6165612 Mon Sep 17 00:00:00 2001 From: Kaho Ng Date: Mon, 5 Sep 2016 23:11:58 -0400 Subject: ext4: remove old feature helpers Use the ext4_{has,set,clear}_feature_* helpers to replace the old feature helpers. Signed-off-by: Kaho Ng Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong --- fs/ext4/ext4.h | 20 -------------------- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 7 +++---- fs/ext4/ioctl.c | 6 ++---- 4 files changed, 6 insertions(+), 29 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c2bf4444548..962a0444dc84 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1642,26 +1642,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * Feature set definitions */ -/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ -#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) -#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9e66cd1d7b78..170421edfdfe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -802,7 +802,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, } else inode_init_owner(inode, dir, mode); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 15d4b4182ac7..f058afbc4c46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4414,7 +4414,7 @@ static inline void ext4_iget_extra_inode(struct inode *inode, int ext4_get_projid(struct inode *inode, kprojid_t *projid) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; @@ -4481,7 +4481,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); @@ -4885,8 +4885,7 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT) && + BUG_ON(!ext4_has_feature_project(inode->i_sb) && i_projid != EXT4_DEF_PROJID); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 10686fd67fb4..ef4430b2f04a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -310,8 +310,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else @@ -842,8 +841,7 @@ resizefs_out: ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (ext4_has_feature_project(inode->i_sb)) { fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, EXT4_I(inode)->i_projid); } -- cgit 1.4.1 From 6ae4c5a69877e5afb6ce63f7d82432133934073a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Sep 2016 23:21:43 -0400 Subject: ext4: cleanup ext4_sync_parent() A condition !hlist_empty(&inode->i_dentry) is always true for open file. Just remove it. Also ext4_sync_parent() could use some explanation why races with rmdir() are not an issue - add a comment explaining that. Reported-by: Al Viro Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/fsync.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5c4372512ef7..88effb1053c7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -61,6 +61,13 @@ static int ext4_sync_parent(struct inode *inode) break; iput(inode); inode = next; + /* + * The directory inode may have gone through rmdir by now. But + * the inode itself and its blocks are still allocated (we hold + * a reference to the inode so it didn't go through + * ext4_evict_inode()) and so we are safe to flush metadata + * blocks and the inode. + */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; @@ -107,7 +114,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!journal) { ret = __generic_file_fsync(file, start, end, datasync); - if (!ret && !hlist_empty(&inode->i_dentry)) + if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) goto issue_flush; -- cgit 1.4.1 From e22834f0248d0fa841ead6436d6c19f65539dc9c Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 5 Sep 2016 23:38:36 -0400 Subject: ext4: improve ext4lazyinit scalability ext4lazyinit is a global thread. This thread performs itable initalization under li_list_mtx mutex. It basically does the following: ext4_lazyinit_thread ->mutex_lock(&eli->li_list_mtx); ->ext4_run_li_request(elr) ->ext4_init_inode_table-> Do a lot of IO if the list is large And when new mount/umount arrive they have to block on ->li_list_mtx because lazy_thread holds it during full walk procedure. ext4_fill_super ->ext4_register_li_request ->mutex_lock(&ext4_li_info->li_list_mtx); ->list_add(&elr->lr_request, &ext4_li_info >li_request_list); In my case mount takes 40minutes on server with 36 * 4Tb HDD. Common user may face this in case of very slow dev ( /dev/mmcblkXXX) Even more. If one of filesystems was frozen lazyinit_thread will simply block on sb_start_write() so other mount/umount will be stuck forever. This patch changes logic like follows: - grab ->s_umount read sem before processing new li_request. After that it is safe to drop li_list_mtx because all callers of li_remove_request are holding ->s_umount for write. - li_thread skips frozen SB's Locking order: Mh KOrder is asserted by umount path like follows: s_umount ->li_list_mtx so the only way to to grab ->s_mount inside li_thread is via down_read_trylock xfstests:ext4/023 #PSBM-49658 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5819b0e8ff66..50912cc5fb96 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2749,7 +2749,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; - sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2776,8 +2775,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } - sb_end_write(sb); - return ret; } @@ -2842,19 +2839,43 @@ cont_thread: mutex_unlock(&eli->li_list_mtx); goto exit_thread; } - list_for_each_safe(pos, n, &eli->li_request_list) { + int err = 0; + int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) { - if (ext4_run_li_request(elr) != 0) { - /* error, remove the lazy_init job */ - ext4_remove_li_request(elr); - continue; + if (time_before(jiffies, elr->lr_next_sched)) { + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* + * We hold sb->s_umount, sb can not + * be removed from the list, it is + * now safe to drop li_list_mtx + */ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); + n = pos->next; } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + continue; + } + if (!progress) { + elr->lr_next_sched = jiffies + + (prandom_u32() + % (EXT4_DEF_LI_MAX_START_DELAY * HZ)); } - if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; } -- cgit 1.4.1 From 4e800c0359d9a53e6bf0ab216954971b2515247f Mon Sep 17 00:00:00 2001 From: wangguang Date: Thu, 15 Sep 2016 11:32:46 -0400 Subject: ext4: bugfix for mmaped pages in mpage_release_unused_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pages clear buffers after ext4 delayed block allocation failed, However, it does not clean its pte_dirty flag. if the pages unmap ,in cording to the pte_dirty , unmap_page_range may try to call __set_page_dirty, which may lead to the bugon at mpage_prepare_extent_to_map:head = page_buffers(page);. This patch just call clear_page_dirty_for_io to clean pte_dirty at mpage_release_unused_pages for pages mmaped. Steps to reproduce the bug: (1) mmap a file in ext4 addr = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); memset(addr, 'i', 4096); (2) return EIO at ext4_writepages->mpage_map_and_submit_extent->mpage_map_one_extent which causes this log message to be print: ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); (3)Unmap the addr cause warning at __set_page_dirty:WARN_ON_ONCE(warn && !PageUptodate(page)); (4) wait for a minute,then bugon happen. Cc: stable@vger.kernel.org Signed-off-by: wangguang Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f058afbc4c46..9b464e5272bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1649,6 +1649,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_SIZE); ClearPageUptodate(page); } -- cgit 1.4.1 From edf15aa180d7b98fe16bd3eda42f9dd0e60dee20 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:39:52 -0400 Subject: ext4: fix memory leak in ext4_insert_range() Running xfstests generic/013 with kmemleak gives the following: unreferenced object 0xffff8801d3d27de0 (size 96): comm "fsstress", pid 4941, jiffies 4294860168 (age 53.485s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x23/0x40 [] __kmalloc+0xf5/0x1d0 [] ext4_find_extent+0x1ec/0x2f0 [] ext4_insert_range+0x34c/0x4a0 [] ext4_fallocate+0x4e2/0x8b0 [] vfs_fallocate+0x134/0x210 [] SyS_fallocate+0x3f/0x60 [] entry_SYSCALL_64_fastpath+0x13/0x8f [] 0xffffffffffffffff Problem seems mitigated by dropping refs and freeing path when there's no path[depth].p_ext Cc: stable@vger.kernel.org Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d7ccb7f51dfc..7f69347bd5a5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5734,6 +5734,9 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + } else { + ext4_ext_drop_refs(path); + kfree(path); } ret = ext4_es_remove_extent(inode, offset_lblk, -- cgit 1.4.1 From c3fe493ccdb1f443c30155150391835004014c6a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:52:07 -0400 Subject: ext4: remove unneeded test in ext4_alloc_file_blocks() ext4_alloc_file_blocks() is called from ext4_zero_range() and ext4_fallocate() both already testing EXT4_INODE_EXTENTS We can call ext_depth(inode) unconditionnally. [ Added BUG_ON check to make sure ext4_alloc_file_blocks() won't get called for a indirect-mapped inode in the future. -- tytso ] Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7f69347bd5a5..ef288558b869 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4679,6 +4679,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, unsigned int credits; loff_t epos; + BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* @@ -4693,13 +4694,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); - /* - * We can only call ext_depth() on extent based inodes - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - depth = ext_depth(inode); - else - depth = -1; + depth = ext_depth(inode); retry: while (ret >= 0 && len) { -- cgit 1.4.1 From 518eaa6387df796d54e065e04502e800cae1df80 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:55:01 -0400 Subject: ext4: create EXT4_MAX_BLOCKS() macro Create a macro to calculate length + offset -> maximum blocks This adds more readability. Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +++ fs/ext4/extents.c | 15 +++------------ fs/ext4/file.c | 3 +-- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 962a0444dc84..282a51b07c57 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -262,6 +262,9 @@ struct ext4_io_submit { (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) +#define EXT4_MAX_BLOCKS(size, offset, blkbits) \ + ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ + blkbits)) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ef288558b869..c930a0110fb4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4961,13 +4961,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - lblk; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -5030,12 +5025,8 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, unsigned int credits, blkbits = inode->i_blkbits; map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk); + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 261ac3734c58..34acda78ce25 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -144,8 +144,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int err, len; map.m_lblk = pos >> blkbits; - map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) - - map.m_lblk; + map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); len = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); -- cgit 1.4.1 From be32197cd6b8c37425ca954e719d236e6117a8ee Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 15 Sep 2016 11:58:47 -0400 Subject: ext4: remove unused definition for MAX_32_NUM MAX_32_NUM isn't used in ext4 Signed-off-by: Fabian Frederick Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ef4430b2f04a..c64239b24c2f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,8 +19,6 @@ #include "ext4_jbd2.h" #include "ext4.h" -#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) - /** * Swap memory between @a and @b for @len bytes. * -- cgit 1.4.1 From dcce7a46c6f28f41447272fb44348ead8f584573 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 13:13:13 -0400 Subject: ext4: fix memory leak when symlink decryption fails This bug was introduced in v4.8-rc1. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/symlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 4d83d9e05f2e..04a7850a0d45 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -65,13 +65,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; + paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - paddr = pstr.name; - /* Null-terminate the name */ if (res <= pstr.len) paddr[res] = '\0'; -- cgit 1.4.1 From ef1eb3aa50930f026135085cd160b1212cdfe817 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 15 Sep 2016 17:25:55 -0400 Subject: fscrypto: make filename crypto functions return 0 on success Several filename crypto functions: fname_decrypt(), fscrypt_fname_disk_to_usr(), and fscrypt_fname_usr_to_disk(), returned the output length on success or -errno on failure. However, the output length was redundant with the value written to 'oname->len'. It is also potentially error-prone to make callers have to check for '< 0' instead of '!= 0'. Therefore, make these functions return 0 instead of a length, and make the callers who cared about the return value being a length use 'oname->len' instead. For consistency also make other callers check for a nonzero result rather than a negative result. This change also fixes the inconsistency of fname_encrypt() actually already returning 0 on success, not a length like the other filename crypto functions and as documented in its function comment. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Acked-by: Jaegeuk Kim --- fs/crypto/fname.c | 56 +++++++++++++++++++++++++++++++------------------------ fs/ext4/dir.c | 5 +++-- fs/ext4/namei.c | 8 ++++---- fs/ext4/symlink.c | 5 ++--- fs/f2fs/dir.c | 6 +++--- fs/f2fs/namei.c | 6 +++--- 6 files changed, 47 insertions(+), 39 deletions(-) (limited to 'fs/ext4') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6bbc3b1859e1..90697c70c23b 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -35,11 +35,11 @@ static void fname_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) @@ -105,20 +105,22 @@ static int fname_encrypt(struct inode *inode, } kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } oname->len = ciphertext_len; - return res; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -168,7 +170,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -279,6 +281,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +293,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +308,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +320,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +341,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +375,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 67415e0e6af0..4d4b91029109 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -260,11 +260,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &de_name, &fstr); + de_name = fstr; fstr.len = save_len; - if (err < 0) + if (err) goto errout; if (!dir_emit(ctx, - fstr.name, err, + de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 34c0142caf6a..2243ae2ad2ee 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -639,7 +639,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_alloc_buffer( dir, len, &fname_crypto_str); - if (res < 0) + if (res) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " @@ -647,7 +647,7 @@ static struct stats dx_show_leaf(struct inode *dir, res = fscrypt_fname_disk_to_usr(dir, 0, 0, &de_name, &fname_crypto_str); - if (res < 0) { + if (res) { printk(KERN_WARNING "Error " "converting filename " "from disk to usr" @@ -1011,7 +1011,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, hinfo->minor_hash, &de_name, &fname_crypto_str); - if (err < 0) { + if (err) { count = err; goto errout; } @@ -3144,7 +3144,7 @@ static int ext4_symlink(struct inode *dir, istr.name = (const unsigned char *) symname; istr.len = len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); disk_link.name = (char *) sd; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 04a7850a0d45..0a26cbd529c8 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -68,12 +68,11 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* Null-terminate the name */ - if (res <= pstr.len) - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; if (cpage) put_page(cpage); set_delayed_call(done, kfree_link, paddr); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9054aeac8015..8716943335b1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -786,7 +786,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); if (!de_name.name) @@ -794,11 +794,11 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, memcpy(de_name.name, d->filename[bit_pos], de_name.len); - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); kfree(de_name.name); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 73fa356f8fbb..afd56332d34c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -449,7 +449,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1048,7 +1048,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1060,7 +1060,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); set_delayed_call(done, kfree_link, paddr); -- cgit 1.4.1 From cca32b7eeb4ea24fa6596650e06279ad9130af98 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 22 Sep 2016 11:49:38 -0400 Subject: ext4: allow DAX writeback for hole punch Currently when doing a DAX hole punch with ext4 we fail to do a writeback. This is because the logic around filemap_write_and_wait_range() in ext4_punch_hole() only looks for dirty page cache pages in the radix tree, not for dirty DAX exceptional entries. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b464e5272bb..39883158b970 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3892,7 +3892,7 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, } /* - * ext4_punch_hole: punches a hole in a file by releaseing the blocks + * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode @@ -3921,7 +3921,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Write out all dirty pages to avoid race conditions * Then release them. */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) -- cgit 1.4.1 From 16c54688592ce8eea85d2a26d37b64fa07e3e233 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:03:17 -0400 Subject: ext4: Allow parallel DIO reads We can easily support parallel direct IO reads. We only have to make sure we cannot expose uninitialized data by reading allocated block to which data was not written yet, or which was already truncated. That is easily achieved by holding inode_lock in shared mode - that excludes all writes, truncates, hole punches. We also have to guard against page writeback allocating blocks for delay-allocated pages - that race is handled by the fact that we writeback all the pages in the affected range and the lock protects us from new pages being created there. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 39883158b970..d8a4afc5eedb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3528,35 +3528,31 @@ out: static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { - int unlocked = 0; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; ssize_t ret; - if (ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) - inode_dio_end(inode); - else - unlocked = 1; - } + /* + * Shared inode_lock is enough for us - it protects against concurrent + * writes & truncates and since we take care of writing back page cache, + * we are protected against page writeback as well. + */ + inode_lock_shared(inode); if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, - NULL, unlocked ? 0 : DIO_LOCKING); + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); } else { + size_t count = iov_iter_count(iter); + + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, ext4_dio_get_block, - NULL, NULL, - unlocked ? 0 : DIO_LOCKING); + NULL, NULL, 0); } - if (unlocked) - inode_dio_end(inode); +out_unlock: + inode_unlock_shared(inode); return ret; } -- cgit 1.4.1 From e81d44778d1d57bbaef9e24c4eac7c8a7a401d40 Mon Sep 17 00:00:00 2001 From: gmail Date: Fri, 30 Sep 2016 01:33:37 -0400 Subject: ext4: release bh in make_indexed_dir The commit 6050d47adcad: "ext4: bail out from make_indexed_dir() on first error" could end up leaking bh2 in the error path. [ Also avoid renaming bh2 to bh, which just confuses things --tytso ] Cc: stable@vger.kernel.org Signed-off-by: yangsheng Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2243ae2ad2ee..c344b819cffa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2044,33 +2044,31 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, frame->entries = entries; frame->at = entries; frame->bh = bh; - bh = bh2; retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) goto out_frames; - retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &fname->hinfo); + de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } - dx_release(frames); - retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); - brelse(bh); - return retval; + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2); out_frames: /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up * with corrupted filesystem. */ - ext4_mark_inode_dirty(handle, dir); + if (retval) + ext4_mark_inode_dirty(handle, dir); dx_release(frames); + brelse(bh2); return retval; } -- cgit 1.4.1 From cc91542ac8f29d8446ef59234796b95f0fbce09a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 30 Sep 2016 01:44:17 -0400 Subject: ext4: do not unnecessarily null-terminate encrypted symlink data Null-terminating the fscrypt_symlink_data on read is unnecessary because it is not string data --- it contains binary ciphertext. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/ext4/symlink.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 0a26cbd529c8..fdf1c6154745 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -30,7 +30,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, char *caddr, *paddr = NULL; struct fscrypt_str cstr, pstr; struct fscrypt_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 max_size = inode->i_sb->s_blocksize; @@ -49,7 +48,6 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, if (IS_ERR(cpage)) return ERR_CAST(cpage); caddr = page_address(cpage); - caddr[size] = 0; } /* Symlink is encrypted */ -- cgit 1.4.1 From 9a200d075e5d05be1fcad4547a0f8aee4e2f9a04 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 30 Sep 2016 01:49:55 -0400 Subject: ext4: require encryption feature for EXT4_IOC_SET_ENCRYPTION_POLICY ...otherwise an user can enable encryption for certain files even when the filesystem is unable to support it. Such a case would be a filesystem created by mkfs.ext4's default settings, 1KiB block size. Ext4 supports encyption only when block size is equal to PAGE_SIZE. But this constraint is only checked when the encryption feature flag is set. Signed-off-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c64239b24c2f..22abf7aeb25d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -769,6 +769,9 @@ resizefs_out: #ifdef CONFIG_EXT4_FS_ENCRYPTION struct fscrypt_policy policy; + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) -- cgit 1.4.1 From 4b0524aae0082272737c97d2b160d55d6e8f0b2b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:55:32 -0400 Subject: ext4: allow unlocked direct IO when pages are cached Currently we do not allow unlocked (meaning without inode_lock) direct IO when the file has any pages cached. This check is not needed anymore as we keep inode lock until ext4_direct_IO_write() and thus can happily writeback and evict any pages conflicting with current direct IO write. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 34acda78ce25..25342c8dd5d6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -91,7 +91,6 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; @@ -138,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, len; -- cgit 1.4.1 From 51e8137b82622d8ea22e993d613db568f11c1523 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 01:57:41 -0400 Subject: ext4: remove plugging from ext4_file_write_iter() do_blockdev_direct_IO() takes care of properly plugging direct IO so there's no need to plug again inside ext4_file_write_iter(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 25342c8dd5d6..25f763f2201a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -92,7 +92,6 @@ static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; int unaligned_aio = 0; int overwrite = 0; @@ -133,7 +132,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) { size_t length = iov_iter_count(from); loff_t pos = iocb->ki_pos; - blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !unaligned_aio && @@ -169,8 +167,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) ret = generic_write_sync(iocb, ret); - if (o_direct) - blk_finish_plug(&plug); return ret; -- cgit 1.4.1 From 9b623df614576680cadeaa4d7e0b5884de8f7c17 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 30 Sep 2016 02:02:29 -0400 Subject: ext4: unmap metadata when zeroing blocks When zeroing blocks for DAX allocations, we also have to unmap aliases in the block device mappings. Otherwise writeback can overwrite zeros with stale data from block device page cache. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8a4afc5eedb..cd918823b352 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -647,11 +647,19 @@ found: /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + ext4_lblk_t i; + + for (i = 0; i < map->m_len; i++) { + unmap_underlying_metadata(inode->i_sb->s_bdev, + map->m_pblk + i); + } ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { -- cgit 1.4.1 From c6cb7e776ad59feba5d84b2524efd6dcbc618e42 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Fri, 30 Sep 2016 02:05:09 -0400 Subject: ext4: create function to read journal inode Factor out the code used in ext4_get_journal() to read a valid journal inode from storage, enabling its reuse in other functions. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 50912cc5fb96..056ca1bea2ac 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -78,6 +78,8 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum); /* * Lock ordering @@ -4237,18 +4239,16 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) write_unlock(&journal->j_state_lock); } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static struct inode *ext4_get_journal_inode(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; - journal_t *journal; - - BUG_ON(!ext4_has_feature_journal(sb)); - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. */ + /* + * Test for the existence of a valid inode on disk. Bad things + * happen if we iget() an unused inode, as the subsequent iput() + * will try to delete it. + */ journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); @@ -4268,6 +4268,20 @@ static journal_t *ext4_get_journal(struct super_block *sb, iput(journal_inode); return NULL; } + return journal_inode; +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!ext4_has_feature_journal(sb)); + + journal_inode = ext4_get_journal_inode(sb, journal_inum); + if (!journal_inode) + return NULL; journal = jbd2_journal_init_inode(journal_inode); if (!journal) { -- cgit 1.4.1 From 3c816ded78bbd645d61ee36f461deb8376d3723a Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Fri, 30 Sep 2016 02:08:49 -0400 Subject: ext4: use journal inode to determine journal overhead When a file system contains an internal journal that has not been loaded, use the journal inode's i_size field to determine its contribution to the file system's overhead. (The journal's j_maxlen field is normally used to determine its size, but it's unavailable when the journal has not been loaded.) Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 056ca1bea2ac..6db81fbcbaa6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3210,6 +3210,8 @@ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct inode *j_inode; + unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); @@ -3240,10 +3242,23 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the internal journal blocks as well */ + + /* + * Add the internal journal blocks whether the journal has been + * loaded or not + */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - + else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + j_inode = ext4_get_journal_inode(sb, j_inum); + if (j_inode) { + j_blocks = j_inode->i_size >> sb->s_blocksize_bits; + overhead += EXT4_NUM_B2C(sbi, j_blocks); + iput(j_inode); + } else { + ext4_msg(sb, KERN_ERR, "can't get journal size"); + } + } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); -- cgit 1.4.1 From 18017479cabaeb5c659d789f04ecf7939f8ee28f Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Fri, 30 Sep 2016 02:14:56 -0400 Subject: ext4: remove unused variable Signed-off-by: Eric Engestrom --- fs/ext4/dir.c | 3 +-- fs/ext4/page-io.c | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 4d4b91029109..e8b365000d73 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -628,7 +628,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; - int nlen, rlen; + int rlen; unsigned int offset = 0; char *top; @@ -638,7 +638,6 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; - nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a6132a730967..b4cbee936cf8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -405,14 +405,12 @@ int ext4_bio_write_page(struct ext4_io_submit *io, { struct page *data_page = NULL; struct inode *inode = page->mapping->host; - unsigned block_start, blocksize; + unsigned block_start; struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; int nr_to_submit = 0; - blocksize = 1 << inode->i_blkbits; - BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); -- cgit 1.4.1