summary refs log tree commit diff
path: root/fs/ceph/file.c
diff options
context:
space:
mode:
authorJeff Layton <jlayton@kernel.org>2019-08-02 13:15:39 -0400
committerIlya Dryomov <idryomov@gmail.com>2019-09-16 12:06:25 +0200
commit321fe13c939876b55fbb7780a243c86e577a2151 (patch)
tree9963961399b7648c1207012d31b1dc51f50a389c /fs/ceph/file.c
parent4766815b1179dbe4fe263a5f95795710e29276e2 (diff)
downloadlinux-321fe13c939876b55fbb7780a243c86e577a2151.tar.gz
ceph: add buffered/direct exclusionary locking for reads and writes
xfstest generic/451 intermittently fails. The test does O_DIRECT writes
to a file, and then reads back the result using buffered I/O, while
running a separate set of tasks that are also doing buffered reads.

The client will invalidate the cache prior to a direct write, but it's
easy for one of the other readers' replies to race in and reinstantiate
the invalidated range with stale data.

To fix this, we must to serialize direct I/O writes and buffered reads.
We could just sprinkle in some shared locks on the i_rwsem for reads,
and increase the exclusive footprint on the write side, but that would
cause O_DIRECT writes to end up serialized vs. other direct requests.

Instead, borrow the scheme used by nfs.ko. Buffered writes take the
i_rwsem exclusively, but buffered reads take a shared lock, allowing
them to run in parallel.

O_DIRECT requests also take a shared lock, but we need for them to not
run in parallel with buffered reads.  A flag on the ceph_inode_info is
used to indicate whether it's in direct or buffered I/O mode. When a
conflicting request is submitted, it will block until the inode can be
flipped to the necessary mode.

Link: https://tracker.ceph.com/issues/40985
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r--fs/ceph/file.c35
1 files changed, 22 insertions, 13 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5182e1a49d6f..ff17a81bf2e2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
 #include "super.h"
 #include "mds_client.h"
 #include "cache.h"
+#include "io.h"
 
 static __le32 ceph_flags_sys2wire(u32 flags)
 {
@@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	struct ceph_aio_request *aio_req = NULL;
 	int num_pages = 0;
 	int flags;
-	int ret;
+	int ret = 0;
 	struct timespec64 mtime = current_time(inode);
 	size_t count = iov_iter_count(iter);
 	loff_t pos = iocb->ki_pos;
@@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 	     (write ? "write" : "read"), file, pos, (unsigned)count,
 	     snapc, snapc ? snapc->seq : 0);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping,
-					   pos, pos + count - 1);
-	if (ret < 0)
-		return ret;
-
 	if (write) {
 		int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
 					pos >> PAGE_SHIFT,
@@ -1284,12 +1280,16 @@ again:
 
 		if (ci->i_inline_version == CEPH_INLINE_NONE) {
 			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+				ceph_start_io_direct(inode);
 				ret = ceph_direct_read_write(iocb, to,
 							     NULL, NULL);
+				ceph_end_io_direct(inode);
 				if (ret >= 0 && ret < len)
 					retry_op = CHECK_EOF;
 			} else {
+				ceph_start_io_read(inode);
 				ret = ceph_sync_read(iocb, to, &retry_op);
+				ceph_end_io_read(inode);
 			}
 		} else {
 			retry_op = READ_INLINE;
@@ -1300,7 +1300,9 @@ again:
 		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
 		     ceph_cap_string(got));
 		ceph_add_rw_context(fi, &rw_ctx);
+		ceph_start_io_read(inode);
 		ret = generic_file_read_iter(iocb, to);
+		ceph_end_io_read(inode);
 		ceph_del_rw_context(fi, &rw_ctx);
 	}
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return -ENOMEM;
 
 retry_snap:
-	inode_lock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_start_io_direct(inode);
+	else
+		ceph_start_io_write(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1480,7 +1485,6 @@ retry_snap:
 	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
 		struct ceph_snap_context *snapc;
 		struct iov_iter data;
-		inode_unlock(inode);
 
 		spin_lock(&ci->i_ceph_lock);
 		if (__ceph_have_pending_cap_snap(ci)) {
@@ -1497,11 +1501,14 @@ retry_snap:
 
 		/* we might need to revert back to that point */
 		data = *from;
-		if (iocb->ki_flags & IOCB_DIRECT)
+		if (iocb->ki_flags & IOCB_DIRECT) {
 			written = ceph_direct_read_write(iocb, &data, snapc,
 							 &prealloc_cf);
-		else
+			ceph_end_io_direct(inode);
+		} else {
 			written = ceph_sync_write(iocb, &data, pos, snapc);
+			ceph_end_io_write(inode);
+		}
 		if (written > 0)
 			iov_iter_advance(from, written);
 		ceph_put_snap_context(snapc);
@@ -1516,7 +1523,7 @@ retry_snap:
 		written = generic_perform_write(file, from, pos);
 		if (likely(written >= 0))
 			iocb->ki_pos = pos + written;
-		inode_unlock(inode);
+		ceph_end_io_write(inode);
 	}
 
 	if (written >= 0) {
@@ -1551,9 +1558,11 @@ retry_snap:
 	}
 
 	goto out_unlocked;
-
 out:
-	inode_unlock(inode);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		ceph_end_io_direct(inode);
+	else
+		ceph_end_io_write(inode);
 out_unlocked:
 	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;