summary refs log tree commit diff
path: root/drivers/md
diff options
context:
space:
mode:
authorKent Overstreet <kmo@daterainc.com>2013-12-17 23:49:49 -0800
committerKent Overstreet <kmo@daterainc.com>2014-01-08 13:05:12 -0800
commitee811287c9f241641899788cbfc9d70ed96ba3a5 (patch)
tree91995b415b61e17bc5cc0796e2a7d933c1ca4e60 /drivers/md
parent67539e85289c14a76a1c4162613d14a5f05a0027 (diff)
downloadlinux-ee811287c9f241641899788cbfc9d70ed96ba3a5.tar.gz
bcache: Rename/shuffle various code around
More work to disentangle bset.c from the rest of the code:

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h8
-rw-r--r--drivers/md/bcache/bset.c172
-rw-r--r--drivers/md/bcache/bset.h247
-rw-r--r--drivers/md/bcache/btree.c167
-rw-r--r--drivers/md/bcache/btree.h9
-rw-r--r--drivers/md/bcache/debug.c3
-rw-r--r--drivers/md/bcache/journal.c9
-rw-r--r--drivers/md/bcache/super.c2
8 files changed, 341 insertions, 276 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 7bd4c93475e7..5c74d55cea7f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -717,14 +717,6 @@ struct bbio {
 #define bucket_bytes(c)		((c)->sb.bucket_size << 9)
 #define block_bytes(c)		((c)->sb.block_size << 9)
 
-#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
-#define set_bytes(i)		__set_bytes(i, i->keys)
-
-#define __set_blocks(i, k, c)	DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
-#define set_blocks(i, c)	__set_blocks(i, (i)->keys, c)
-
-#define btree_data_space(b)	(PAGE_SIZE << (b)->page_order)
-
 #define prios_per_bucket(c)				\
 	((bucket_bytes(c) - sizeof(struct prio_set)) /	\
 	 sizeof(struct bucket_disk))
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 9d9c2edda760..e04e5908e29f 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -302,6 +302,115 @@ bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
 	return true;
 }
 
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS		3
+#define BKEY_EXPONENT_BITS	7
+#define BKEY_MANTISSA_BITS	(32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+	unsigned	exponent:BKEY_EXPONENT_BITS;
+	unsigned	m:BKEY_MID_BITS;
+	unsigned	mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It definites the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliar search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE		128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree *b)
+{
+	return PAGE_SIZE << b->page_order;
+}
+
+static inline size_t btree_keys_cachelines(struct btree *b)
+{
+	return btree_keys_bytes(b) / BSET_CACHELINE;
+}
+
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree *b)
+{
+	return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
+
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree *b)
+{
+	return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
+
+/* Memory allocation */
+
+void bch_btree_keys_free(struct btree *b)
+{
+	struct bset_tree *t = b->sets;
+
+	if (bset_prev_bytes(b) < PAGE_SIZE)
+		kfree(t->prev);
+	else
+		free_pages((unsigned long) t->prev,
+			   get_order(bset_prev_bytes(b)));
+
+	if (bset_tree_bytes(b) < PAGE_SIZE)
+		kfree(t->tree);
+	else
+		free_pages((unsigned long) t->tree,
+			   get_order(bset_tree_bytes(b)));
+
+	free_pages((unsigned long) t->data, b->page_order);
+
+	t->prev = NULL;
+	t->tree = NULL;
+	t->data = NULL;
+}
+
+int bch_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
+{
+	struct bset_tree *t = b->sets;
+
+	BUG_ON(t->data);
+
+	b->page_order = page_order;
+
+	t->data = (void *) __get_free_pages(gfp, b->page_order);
+	if (!t->data)
+		goto err;
+
+	t->tree = bset_tree_bytes(b) < PAGE_SIZE
+		? kmalloc(bset_tree_bytes(b), gfp)
+		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+	if (!t->tree)
+		goto err;
+
+	t->prev = bset_prev_bytes(b) < PAGE_SIZE
+		? kmalloc(bset_prev_bytes(b), gfp)
+		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+	if (!t->prev)
+		goto err;
+
+	return 0;
+err:
+	bch_btree_keys_free(b);
+	return -ENOMEM;
+}
+
 /* Binary tree stuff for auxiliary search trees */
 
 static unsigned inorder_next(unsigned j, unsigned size)
@@ -538,21 +647,36 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
 		t++->size = 0;
 }
 
-static void bset_build_unwritten_tree(struct btree *b)
+static void bch_bset_build_unwritten_tree(struct btree *b)
 {
-	struct bset_tree *t = b->sets + b->nsets;
+	struct bset_tree *t = bset_tree_last(b);
 
 	bset_alloc_tree(b, t);
 
-	if (t->tree != b->sets->tree + bset_tree_space(b)) {
+	if (t->tree != b->sets->tree + btree_keys_cachelines(b)) {
 		t->prev[0] = bkey_to_cacheline_offset(t->data->start);
 		t->size = 1;
 	}
 }
 
+void bch_bset_init_next(struct btree *b, struct bset *i, uint64_t magic)
+{
+	if (i != b->sets->data) {
+		b->sets[++b->nsets].data = i;
+		i->seq = b->sets->data->seq;
+	} else
+		get_random_bytes(&i->seq, sizeof(uint64_t));
+
+	i->magic	= magic;
+	i->version	= 0;
+	i->keys		= 0;
+
+	bch_bset_build_unwritten_tree(b);
+}
+
 static void bset_build_written_tree(struct btree *b)
 {
-	struct bset_tree *t = b->sets + b->nsets;
+	struct bset_tree *t = bset_tree_last(b);
 	struct bkey *k = t->data->start;
 	unsigned j, cacheline = 1;
 
@@ -560,7 +684,7 @@ static void bset_build_written_tree(struct btree *b)
 
 	t->size = min_t(unsigned,
 			bkey_to_cacheline(t, bset_bkey_last(t->data)),
-			b->sets->tree + bset_tree_space(b) - t->tree);
+			b->sets->tree + btree_keys_cachelines(b) - t->tree);
 
 	if (t->size < 2) {
 		t->size = 0;
@@ -599,7 +723,7 @@ void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k)
 	struct bset_tree *t;
 	unsigned inorder, j = 1;
 
-	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+	for (t = b->sets; t <= bset_tree_last(b); t++)
 		if (k < bset_bkey_last(t->data))
 			goto found_set;
 
@@ -639,9 +763,10 @@ fix_right:	do {
 		} while (j < t->size);
 }
 
-void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
+static void bch_bset_fix_lookup_table(struct btree *b,
+				      struct bset_tree *t,
+				      struct bkey *k)
 {
-	struct bset_tree *t = &b->sets[b->nsets];
 	unsigned shift = bkey_u64s(k);
 	unsigned j = bkey_to_cacheline(t, k);
 
@@ -673,7 +798,7 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
 		}
 	}
 
-	if (t->size == b->sets->tree + bset_tree_space(b) - t->tree)
+	if (t->size == b->sets->tree + btree_keys_cachelines(b) - t->tree)
 		return;
 
 	/* Possibly add a new entry to the end of the lookup table */
@@ -687,21 +812,23 @@ void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k)
 		}
 }
 
-void bch_bset_init_next(struct btree *b)
+void bch_bset_insert(struct btree *b, struct bkey *where,
+		     struct bkey *insert)
 {
-	struct bset *i = write_block(b);
+	struct bset_tree *t = bset_tree_last(b);
 
-	if (i != b->sets[0].data) {
-		b->sets[++b->nsets].data = i;
-		i->seq = b->sets[0].data->seq;
-	} else
-		get_random_bytes(&i->seq, sizeof(uint64_t));
+	BUG_ON(t->data != write_block(b));
+	BUG_ON(bset_byte_offset(b, t->data) +
+	       __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) >
+	       PAGE_SIZE << b->page_order);
 
-	i->magic	= bset_magic(&b->c->sb);
-	i->version	= 0;
-	i->keys		= 0;
+	memmove((uint64_t *) where + bkey_u64s(insert),
+		where,
+		(void *) bset_bkey_last(t->data) - (void *) where);
 
-	bset_build_unwritten_tree(b);
+	t->data->keys += bkey_u64s(insert);
+	bkey_copy(where, insert);
+	bch_bset_fix_lookup_table(b, t, where);
 }
 
 struct bset_search_iter {
@@ -1154,9 +1281,8 @@ void bch_btree_sort_partial(struct btree *b, unsigned start,
 
 	__bch_btree_iter_init(b, &iter, NULL, &b->sets[start]);
 
-	BUG_ON(b->sets[b->nsets].data == write_block(b) &&
-	       (b->sets[b->nsets].size || b->nsets));
-
+	BUG_ON(!bset_written(b, bset_tree_last(b)) &&
+	       (bset_tree_last(b)->size || b->nsets));
 
 	if (start) {
 		unsigned i;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 4f60c21c7a38..ab31f3fc8d43 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -144,22 +144,11 @@
  * first key in that range of bytes again.
  */
 
-struct cache_set;
-
-/* Btree key comparison/iteration */
+struct btree;
+struct bkey_float;
 
 #define MAX_BSETS		4U
 
-struct btree_iter {
-	size_t size, used;
-#ifdef CONFIG_BCACHE_DEBUG
-	struct btree *b;
-#endif
-	struct btree_iter_set {
-		struct bkey *k, *end;
-	} data[MAX_BSETS];
-};
-
 struct bset_tree {
 	/*
 	 * We construct a binary tree in an array as if the array
@@ -169,14 +158,14 @@ struct bset_tree {
 	 */
 
 	/* size of the binary tree and prev array */
-	unsigned	size;
+	unsigned		size;
 
 	/* function of size - precalculated for to_inorder() */
-	unsigned	extra;
+	unsigned		extra;
 
 	/* copy of the last key in the set */
-	struct bkey	end;
-	struct bkey_float *tree;
+	struct bkey		end;
+	struct bkey_float	*tree;
 
 	/*
 	 * The nodes in the bset tree point to specific keys - this
@@ -186,12 +175,61 @@ struct bset_tree {
 	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
 	 * path.
 	 */
-	uint8_t		*prev;
+	uint8_t			*prev;
 
 	/* The actual btree node, with pointers to each sorted set */
-	struct bset	*data;
+	struct bset		*data;
 };
 
+#define __set_bytes(i, k)	(sizeof(*(i)) + (k) * sizeof(uint64_t))
+#define set_bytes(i)		__set_bytes(i, i->keys)
+
+#define __set_blocks(i, k, block_bytes)				\
+	DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
+#define set_blocks(i, block_bytes)				\
+	__set_blocks(i, (i)->keys, block_bytes)
+
+void bch_btree_keys_free(struct btree *);
+int bch_btree_keys_alloc(struct btree *, unsigned, gfp_t);
+
+void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
+void bch_bset_init_next(struct btree *, struct bset *, uint64_t);
+void bch_bset_insert(struct btree *, struct bkey *, struct bkey *);
+
+/* Btree key iteration */
+
+struct btree_iter {
+	size_t size, used;
+#ifdef CONFIG_BCACHE_DEBUG
+	struct btree *b;
+#endif
+	struct btree_iter_set {
+		struct bkey *k, *end;
+	} data[MAX_BSETS];
+};
+
+typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
+
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+					struct btree *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *,
+				 struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
+			   const struct bkey *);
+
+/*
+ * Returns the first key that is strictly greater than search
+ */
+static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
+					   const struct bkey *search)
+{
+	return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
 /* Sorting */
 
 struct bset_sort_state {
@@ -219,6 +257,60 @@ static inline void bch_btree_sort(struct btree *b,
 	bch_btree_sort_partial(b, 0, state);
 }
 
+/* Bkey utility code */
+
+#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
+
+static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
+{
+	return bkey_idx(i->start, idx);
+}
+
+static inline void bkey_init(struct bkey *k)
+{
+	*k = ZERO_KEY;
+}
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+					const struct bkey *r)
+{
+	return unlikely(KEY_INODE(l) != KEY_INODE(r))
+		? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
+		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
+}
+
+void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
+			      unsigned);
+bool __bch_cut_front(const struct bkey *, struct bkey *);
+bool __bch_cut_back(const struct bkey *, struct bkey *);
+
+static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, k) > 0);
+	return __bch_cut_front(where, k);
+}
+
+static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
+{
+	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
+	return __bch_cut_back(where, k);
+}
+
+#define PRECEDING_KEY(_k)					\
+({								\
+	struct bkey *_ret = NULL;				\
+								\
+	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
+		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
+								\
+		if (!_ret->low)					\
+			_ret->high--;				\
+		_ret->low--;					\
+	}							\
+								\
+	_ret;							\
+})
+
 /* Keylists */
 
 struct keylist {
@@ -282,126 +374,15 @@ struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
 int __bch_keylist_realloc(struct keylist *, unsigned);
 
-/* Bkey utility code */
-
-#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
-
-static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx)
-{
-	return bkey_idx(i->start, idx);
-}
-
-static inline void bkey_init(struct bkey *k)
-{
-	*k = ZERO_KEY;
-}
-
-static __always_inline int64_t bkey_cmp(const struct bkey *l,
-					const struct bkey *r)
-{
-	return unlikely(KEY_INODE(l) != KEY_INODE(r))
-		? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
-		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
-}
-
-void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
-			      unsigned);
-bool __bch_cut_front(const struct bkey *, struct bkey *);
-bool __bch_cut_back(const struct bkey *, struct bkey *);
-
-static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
-{
-	BUG_ON(bkey_cmp(where, k) > 0);
-	return __bch_cut_front(where, k);
-}
-
-static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
-{
-	BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
-	return __bch_cut_back(where, k);
-}
-
+struct cache_set;
 const char *bch_ptr_status(struct cache_set *, const struct bkey *);
 bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *);
 bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *);
+bool bch_btree_ptr_bad(struct btree *, const struct bkey *);
+bool bch_extent_ptr_bad(struct btree *, const struct bkey *);
 
 bool bch_ptr_bad(struct btree *, const struct bkey *);
 
-typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
-
-struct bkey *bch_btree_iter_next(struct btree_iter *);
-struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
-					struct btree *, ptr_filter_fn);
-
-void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
-struct bkey *bch_btree_iter_init(struct btree *, struct btree_iter *,
-				 struct bkey *);
-
-/* 32 bits total: */
-#define BKEY_MID_BITS		3
-#define BKEY_EXPONENT_BITS	7
-#define BKEY_MANTISSA_BITS	22
-#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
-
-struct bkey_float {
-	unsigned	exponent:BKEY_EXPONENT_BITS;
-	unsigned	m:BKEY_MID_BITS;
-	unsigned	mantissa:BKEY_MANTISSA_BITS;
-} __packed;
-
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE		128
-#define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE)
-
-#define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float))
-#define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t))
-
-void bch_bset_init_next(struct btree *);
-
-void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
-void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
-
-struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
-			   const struct bkey *);
-
-/*
- * Returns the first key that is strictly greater than search
- */
-static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
-					   const struct bkey *search)
-{
-	return search ? __bch_bset_search(b, t, search) : t->data->start;
-}
-
-#define PRECEDING_KEY(_k)					\
-({								\
-	struct bkey *_ret = NULL;				\
-								\
-	if (KEY_INODE(_k) || KEY_OFFSET(_k)) {			\
-		_ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0);	\
-								\
-		if (!_ret->low)					\
-			_ret->high--;				\
-		_ret->low--;					\
-	}							\
-								\
-	_ret;							\
-})
-
 bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
 
 int bch_bset_print_stats(struct cache_set *, char *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 78ba0b67ac16..89252e7f2879 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -110,7 +110,7 @@ static inline bool should_split(struct btree *b)
 {
 	struct bset *i = write_block(b);
 	return b->written >= btree_blocks(b) ||
-		(b->written + __set_blocks(i, i->keys + 15, b->c)
+		(b->written + __set_blocks(i, i->keys + 15, block_bytes(b->c))
 		 > btree_blocks(b));
 }
 
@@ -206,7 +206,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 void bch_btree_node_read_done(struct btree *b)
 {
 	const char *err = "bad btree header";
-	struct bset *i = b->sets[0].data;
+	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
 	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
@@ -228,7 +228,8 @@ void bch_btree_node_read_done(struct btree *b)
 			goto err;
 
 		err = "bad btree header";
-		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
+		if (b->written + set_blocks(i, block_bytes(b->c)) >
+		    btree_blocks(b))
 			goto err;
 
 		err = "bad magic";
@@ -253,7 +254,7 @@ void bch_btree_node_read_done(struct btree *b)
 
 		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 
-		b->written += set_blocks(i, b->c);
+		b->written += set_blocks(i, block_bytes(b->c));
 	}
 
 	err = "corrupted btree";
@@ -272,7 +273,7 @@ void bch_btree_node_read_done(struct btree *b)
 		goto err;
 
 	if (b->written < btree_blocks(b))
-		bch_bset_init_next(b);
+		bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb));
 out:
 	mempool_free(iter, b->c->fill_iter);
 	return;
@@ -393,7 +394,7 @@ static void btree_node_write_endio(struct bio *bio, int error)
 static void do_btree_node_write(struct btree *b)
 {
 	struct closure *cl = &b->io;
-	struct bset *i = b->sets[b->nsets].data;
+	struct bset *i = btree_bset_last(b);
 	BKEY_PADDED(key) k;
 
 	i->version	= BCACHE_BSET_VERSION;
@@ -405,7 +406,7 @@ static void do_btree_node_write(struct btree *b)
 	b->bio->bi_end_io	= btree_node_write_endio;
 	b->bio->bi_private	= cl;
 	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
-	b->bio->bi_iter.bi_size	= set_blocks(i, b->c) * block_bytes(b->c);
+	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
 	bch_bio_map(b->bio, i);
 
 	/*
@@ -424,7 +425,8 @@ static void do_btree_node_write(struct btree *b)
 	 */
 
 	bkey_copy(&k.key, &b->key);
-	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
+	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
+		       bset_sector_offset(b, i));
 
 	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
 		int j;
@@ -451,14 +453,14 @@ static void do_btree_node_write(struct btree *b)
 
 void bch_btree_node_write(struct btree *b, struct closure *parent)
 {
-	struct bset *i = b->sets[b->nsets].data;
+	struct bset *i = btree_bset_last(b);
 
 	trace_bcache_btree_write(b);
 
 	BUG_ON(current->bio_list);
 	BUG_ON(b->written >= btree_blocks(b));
 	BUG_ON(b->written && !i->keys);
-	BUG_ON(b->sets->data->seq != i->seq);
+	BUG_ON(btree_bset_first(b)->seq != i->seq);
 	bch_check_keys(b, "writing");
 
 	cancel_delayed_work(&b->work);
@@ -472,8 +474,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 
 	do_btree_node_write(b);
 
-	b->written += set_blocks(i, b->c);
-	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
+	b->written += set_blocks(i, block_bytes(b->c));
+	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
 	/* If not a leaf node, always sort */
@@ -490,7 +492,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 		bch_btree_verify(b);
 
 	if (b->written < btree_blocks(b))
-		bch_bset_init_next(b);
+		bch_bset_init_next(b, write_block(b), bset_magic(&b->c->sb));
 }
 
 static void bch_btree_node_write_sync(struct btree *b)
@@ -515,7 +517,7 @@ static void btree_node_write_work(struct work_struct *w)
 
 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 {
-	struct bset *i = b->sets[b->nsets].data;
+	struct bset *i = btree_bset_last(b);
 	struct btree_write *w = btree_current_write(b);
 
 	BUG_ON(!b->written);
@@ -575,29 +577,12 @@ static void mca_reinit(struct btree *b)
 
 static void mca_data_free(struct btree *b)
 {
-	struct bset_tree *t = b->sets;
-
 	BUG_ON(b->io_mutex.count != 1);
 
-	if (bset_prev_bytes(b) < PAGE_SIZE)
-		kfree(t->prev);
-	else
-		free_pages((unsigned long) t->prev,
-			   get_order(bset_prev_bytes(b)));
-
-	if (bset_tree_bytes(b) < PAGE_SIZE)
-		kfree(t->tree);
-	else
-		free_pages((unsigned long) t->tree,
-			   get_order(bset_tree_bytes(b)));
-
-	free_pages((unsigned long) t->data, b->page_order);
+	bch_btree_keys_free(b);
 
-	t->prev = NULL;
-	t->tree = NULL;
-	t->data = NULL;
-	list_move(&b->list, &b->c->btree_cache_freed);
 	b->c->bucket_cache_used--;
+	list_move(&b->list, &b->c->btree_cache_freed);
 }
 
 static void mca_bucket_free(struct btree *b)
@@ -616,34 +601,16 @@ static unsigned btree_order(struct bkey *k)
 
 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 {
-	struct bset_tree *t = b->sets;
-	BUG_ON(t->data);
-
-	b->page_order = max_t(unsigned,
-			      ilog2(b->c->btree_pages),
-			      btree_order(k));
-
-	t->data = (void *) __get_free_pages(gfp, b->page_order);
-	if (!t->data)
-		goto err;
-
-	t->tree = bset_tree_bytes(b) < PAGE_SIZE
-		? kmalloc(bset_tree_bytes(b), gfp)
-		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
-	if (!t->tree)
-		goto err;
-
-	t->prev = bset_prev_bytes(b) < PAGE_SIZE
-		? kmalloc(bset_prev_bytes(b), gfp)
-		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
-	if (!t->prev)
-		goto err;
-
-	list_move(&b->list, &b->c->btree_cache);
-	b->c->bucket_cache_used++;
-	return;
-err:
-	mca_data_free(b);
+	if (!bch_btree_keys_alloc(b,
+				  max_t(unsigned,
+					ilog2(b->c->btree_pages),
+					btree_order(k)),
+				  gfp)) {
+		b->c->bucket_cache_used++;
+		list_move(&b->list, &b->c->btree_cache);
+	} else {
+		list_move(&b->list, &b->c->btree_cache_freed);
+	}
 }
 
 static struct btree *mca_bucket_alloc(struct cache_set *c,
@@ -1111,7 +1078,7 @@ retry:
 	}
 
 	b->accessed = 1;
-	bch_bset_init_next(b);
+	bch_bset_init_next(b, b->sets->data, bset_magic(&b->c->sb));
 
 	mutex_unlock(&c->bucket_lock);
 
@@ -1298,7 +1265,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	blocks = btree_default_blocks(b->c) * 2 / 3;
 
 	if (nodes < 2 ||
-	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
+	    __set_blocks(b->sets[0].data, keys,
+			 block_bytes(b->c)) > blocks * (nodes - 1))
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
@@ -1308,8 +1276,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	}
 
 	for (i = nodes - 1; i > 0; --i) {
-		struct bset *n1 = new_nodes[i]->sets->data;
-		struct bset *n2 = new_nodes[i - 1]->sets->data;
+		struct bset *n1 = btree_bset_first(new_nodes[i]);
+		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
 		struct bkey *k, *last = NULL;
 
 		keys = 0;
@@ -1319,7 +1287,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			     k < bset_bkey_last(n2);
 			     k = bkey_next(k)) {
 				if (__set_blocks(n1, n1->keys + keys +
-						 bkey_u64s(k), b->c) > blocks)
+						 bkey_u64s(k),
+						 block_bytes(b->c)) > blocks)
 					break;
 
 				last = k;
@@ -1335,7 +1304,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			 * though)
 			 */
 			if (__set_blocks(n1, n1->keys + n2->keys,
-					 b->c) > btree_blocks(new_nodes[i]))
+					 block_bytes(b->c)) >
+			    btree_blocks(new_nodes[i]))
 				goto out_nocoalesce;
 
 			keys = n2->keys;
@@ -1343,8 +1313,8 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			last = &r->b->key;
 		}
 
-		BUG_ON(__set_blocks(n1, n1->keys + keys,
-				    b->c) > btree_blocks(new_nodes[i]));
+		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
+		       btree_blocks(new_nodes[i]));
 
 		if (last)
 			bkey_copy_key(&new_nodes[i]->key, last);
@@ -1380,7 +1350,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	}
 
 	/* We emptied out this node */
-	BUG_ON(new_nodes[0]->sets->data->keys);
+	BUG_ON(btree_bset_first(new_nodes[0])->keys);
 	btree_node_free(new_nodes[0]);
 	rw_unlock(true, new_nodes[0]);
 
@@ -1831,19 +1801,6 @@ err:
 
 /* Btree insertion */
 
-static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
-{
-	struct bset *i = b->sets[b->nsets].data;
-
-	memmove((uint64_t *) where + bkey_u64s(insert),
-		where,
-		(void *) bset_bkey_last(i) - (void *) where);
-
-	i->keys += bkey_u64s(insert);
-	bkey_copy(where, insert);
-	bch_bset_fix_lookup_table(b, where);
-}
-
 static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
 				    struct btree_iter *iter,
 				    struct bkey *replace_key)
@@ -1944,13 +1901,13 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
 				 * depends on us inserting a new key for the top
 				 * here.
 				 */
-				top = bch_bset_search(b, &b->sets[b->nsets],
+				top = bch_bset_search(b, bset_tree_last(b),
 						      insert);
-				shift_keys(b, top, k);
+				bch_bset_insert(b, top, k);
 			} else {
 				BKEY_PADDED(key) temp;
 				bkey_copy(&temp.key, k);
-				shift_keys(b, k, &temp.key);
+				bch_bset_insert(b, k, &temp.key);
 				top = bkey_next(k);
 			}
 
@@ -1999,7 +1956,7 @@ check_failed:
 static bool btree_insert_key(struct btree *b, struct btree_op *op,
 			     struct bkey *k, struct bkey *replace_key)
 {
-	struct bset *i = b->sets[b->nsets].data;
+	struct bset *i = btree_bset_last(b);
 	struct bkey *m, *prev;
 	unsigned status = BTREE_INSERT_STATUS_INSERT;
 
@@ -2051,10 +2008,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 			goto copy;
 	} else {
 		BUG_ON(replace_key);
-		m = bch_bset_search(b, &b->sets[b->nsets], k);
+		m = bch_bset_search(b, bset_tree_last(b), k);
 	}
 
-insert:	shift_keys(b, m, k);
+insert:	bch_bset_insert(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
 	bch_check_keys(b, "%u for %s", status,
@@ -2079,8 +2036,9 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 		struct bset *i = write_block(b);
 		struct bkey *k = insert_keys->keys;
 
-		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
-		    > btree_blocks(b))
+		if (b->written +
+		    __set_blocks(i, i->keys + bkey_u64s(k),
+				 block_bytes(b->c)) > btree_blocks(b))
 			break;
 
 		if (bkey_cmp(k, &b->key) <= 0) {
@@ -2130,12 +2088,13 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	if (IS_ERR(n1))
 		goto err;
 
-	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
+	split = set_blocks(btree_bset_first(n1),
+			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
 
 	if (split) {
 		unsigned keys = 0;
 
-		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
+		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
 		n2 = bch_btree_node_alloc(b->c, b->level, true);
 		if (IS_ERR(n2))
@@ -2154,20 +2113,20 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		 * search tree yet
 		 */
 
-		while (keys < (n1->sets[0].data->keys * 3) / 5)
-			keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data,
+		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
+			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
 							keys));
 
 		bkey_copy_key(&n1->key,
-			      bset_bkey_idx(n1->sets[0].data, keys));
-		keys += bkey_u64s(bset_bkey_idx(n1->sets[0].data, keys));
+			      bset_bkey_idx(btree_bset_first(n1), keys));
+		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
 
-		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
-		n1->sets[0].data->keys = keys;
+		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
+		btree_bset_first(n1)->keys = keys;
 
-		memcpy(n2->sets[0].data->start,
-		       bset_bkey_last(n1->sets[0].data),
-		       n2->sets[0].data->keys * sizeof(uint64_t));
+		memcpy(btree_bset_first(n2)->start,
+		       bset_bkey_last(btree_bset_first(n1)),
+		       btree_bset_first(n2)->keys * sizeof(uint64_t));
 
 		bkey_copy_key(&n2->key, &b->key);
 
@@ -2175,7 +2134,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 		bch_btree_node_write(n2, &cl);
 		rw_unlock(true, n2);
 	} else {
-		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
+		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
 
 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 	}
@@ -2256,7 +2215,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 				-EINTR;
 		}
 	} else {
-		BUG_ON(write_block(b) != b->sets[b->nsets].data);
+		BUG_ON(write_block(b) != btree_bset_last(b));
 
 		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
 			if (!b->level)
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 2a5a8481f25e..b3541173ccf2 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -180,9 +180,9 @@ static inline struct btree_write *btree_prev_write(struct btree *b)
 	return b->writes + (btree_node_write_idx(b) ^ 1);
 }
 
-static inline unsigned bset_offset(struct btree *b, struct bset *i)
+static inline struct bset_tree *bset_tree_last(struct btree *b)
 {
-	return (((size_t) i) - ((size_t) b->sets->data)) >> 9;
+	return b->sets + b->nsets;
 }
 
 static inline struct bset *btree_bset_first(struct btree *b)
@@ -190,6 +190,11 @@ static inline struct bset *btree_bset_first(struct btree *b)
 	return b->sets->data;
 }
 
+static inline struct bset *btree_bset_last(struct btree *b)
+{
+	return bset_tree_last(b)->data;
+}
+
 static inline unsigned bset_byte_offset(struct btree *b, struct bset *i)
 {
 	return ((size_t) i) - ((size_t) b->sets->data);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 955fa1d31774..5a78137c420d 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -123,7 +123,8 @@ static void bch_dump_bucket(struct btree *b)
 	for (i = (start);						\
 	     (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\
 	     i->seq == (start)->seq;					\
-	     i = (void *) i + set_blocks(i, b->c) * block_bytes(b->c))
+	     i = (void *) i + set_blocks(i, block_bytes(b->c)) *	\
+		 block_bytes(b->c))
 
 void bch_btree_verify(struct btree *b)
 {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5e14e3325ec1..18039affc306 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -95,7 +95,7 @@ reread:		left = ca->sb.bucket_size - offset;
 				return ret;
 			}
 
-			blocks = set_blocks(j, ca->set);
+			blocks = set_blocks(j, block_bytes(ca->set));
 
 			while (!list_empty(list)) {
 				i = list_first_entry(list,
@@ -579,7 +579,8 @@ static void journal_write_unlocked(struct closure *cl)
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
-	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+		c->sb.block_size;
 
 	struct bio *bio;
 	struct bio_list list;
@@ -595,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl)
 		continue_at(cl, journal_write, system_wq);
 	}
 
-	c->journal.blocks_free -= set_blocks(w->data, c);
+	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
 
 	w->data->btree_level = c->root->level;
 
@@ -685,7 +686,7 @@ static struct journal_write *journal_wait_for_write(struct cache_set *c,
 		struct journal_write *w = c->journal.cur;
 
 		sectors = __set_blocks(w->data, w->data->keys + nkeys,
-				       c) * c->sb.block_size;
+				       block_bytes(c)) * c->sb.block_size;
 
 		if (sectors <= min_t(size_t,
 				     c->journal.blocks_free * c->sb.block_size,
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1fc8165a5c01..12057a43e01f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1477,7 +1477,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->block_bits		= ilog2(sb->block_size);
 	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);
 
-	c->btree_pages		= c->sb.bucket_size / PAGE_SECTORS;
+	c->btree_pages		= bucket_pages(c);
 	if (c->btree_pages > BTREE_MAX_PAGES)
 		c->btree_pages = max_t(int, c->btree_pages / 4,
 				       BTREE_MAX_PAGES);