From eb14ab8ed24a0405fd056068b28c33a1cd846024 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Feb 2011 12:35:00 -0500
Subject: Btrfs: fix page->private races

There is a race where btrfs_releasepage can drop the
page->private contents just as alloc_extent_buffer is setting
up pages for metadata.  Because of how the Btrfs page flags work,
this results in us skipping the crc on the page during IO.

This patch sovles the race by waiting until after the extent buffer
is inserted into the radix tree before it sets page private.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8862dda46ff6..0418bf2c9757 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1946,6 +1946,7 @@ void set_page_extent_mapped(struct page *page)
 
 static void set_page_extent_head(struct page *page, unsigned long len)
 {
+	WARN_ON(!PagePrivate(page));
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
 
@@ -3195,7 +3196,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		}
 		if (!PageUptodate(p))
 			uptodate = 0;
-		unlock_page(p);
+
+		/*
+		 * see below about how we avoid a nasty race with release page
+		 * and why we unlock later
+		 */
+		if (i != 0)
+			unlock_page(p);
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3219,9 +3226,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
 	radix_tree_preload_end();
+
+	/*
+	 * there is a race where release page may have
+	 * tried to find this extent buffer in the radix
+	 * but failed.  It will tell the VM it is safe to
+	 * reclaim the, and it will clear the page private bit.
+	 * We must make sure to set the page private bit properly
+	 * after the extent buffer is in the radix tree so
+	 * it doesn't get lost
+	 */
+	set_page_extent_mapped(eb->first_page);
+	set_page_extent_head(eb->first_page, eb->len);
+	if (!page0)
+		unlock_page(eb->first_page);
 	return eb;
 
 free_eb:
+	if (eb->first_page && !page0)
+		unlock_page(eb->first_page);
+
 	if (!atomic_dec_and_test(&eb->refs))
 		return exists;
 	btrfs_release_extent_buffer(eb);
@@ -3272,10 +3296,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			continue;
 
 		lock_page(page);
+		WARN_ON(!PagePrivate(page));
+
+		set_page_extent_mapped(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
-		else
-			set_page_private(page, EXTENT_PAGE_PRIVATE);
 
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
@@ -3465,6 +3490,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
+
+		WARN_ON(!PagePrivate(page));
+
+		set_page_extent_mapped(page);
+		if (i == 0)
+			set_page_extent_head(page, eb->len);
+
 		if (inc_all_pages)
 			page_cache_get(page);
 		if (!PageUptodate(page)) {
-- 
cgit 1.4.1


From e3f24cc521cb7ba60ac137abd1939e4e03435e80 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Feb 2011 12:52:08 -0500
Subject: Btrfs: don't release pages when we can't clear the uptodate bits

Btrfs tracks uptodate state in an rbtree as well as in the
page bits.  This is supposed to enable us to use block sizes other than
the page size, but there are a few parts still missing before that
completely works.

But, our readpage routine trusts this additional range based tracking
of uptodateness, much in the same way the buffer head up to date bits
are trusted for the other filesystems.

The problem is that sometimes we need to allocate memory in order to
split records in the rbtree, even when we are just clearing bits.  This
can be difficult when our clearing function is called GFP_ATOMIC, which
can happen in the releasepage path.

So, what happens today looks like this:

releasepage called with GFP_ATOMIC
btrfs_releasepage calls clear_extent_bit
clear_extent_bit fails to allocate ram, leaving the up to date bit set
btrfs_releasepage returns success

The end result is the page being gone, but btrfs thinking the range is
up to date.   Later on if someone tries to read that same page, the
btrfs readpage code will return immediately thinking the page is already
up to date.

This commit fixes things to fail the releasepage when we can't clear the
extent state bits.  It covers both data pages and metadata tree blocks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/extent_io.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0418bf2c9757..e7aeba242701 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2822,9 +2822,17 @@ int try_release_extent_state(struct extent_map_tree *map,
 		 * at this point we can safely clear everything except the
 		 * locked bit and the nodatasum bit
 		 */
-		clear_extent_bit(tree, start, end,
+		ret = clear_extent_bit(tree, start, end,
 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
 				 0, 0, NULL, mask);
+
+		/* if clear_extent_bit failed for enomem reasons,
+		 * we can't allow the release to continue.
+		 */
+		if (ret < 0)
+			ret = 0;
+		else
+			ret = 1;
 	}
 	return ret;
 }
-- 
cgit 1.4.1