author    Linus Torvalds <torvalds@linux-foundation.org>  2015-06-25 16:00:17 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-06-25 16:00:17 -0700
commit    e4bc13adfd016fc1036838170288b5680d1a98b0 (patch)
tree      8d2cb749397749439732f3a827cb7f2336408337 /include
parent    ad90fb97515b732bc27a0109baa10af636c3c8cd (diff)
parent    3e1534cf4a2a8278e811e7c84a79da1a02347b8b (diff)
download  linux-e4bc13adfd016fc1036838170288b5680d1a98b0.tar.gz
Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.  This is one
  of those problems that has been talked about for at least half a
  decade, finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...
Diffstat (limited to 'include')
-rw-r--r--  include/linux/backing-dev-defs.h    255
-rw-r--r--  include/linux/backing-dev.h         557
-rw-r--r--  include/linux/bio.h                   3
-rw-r--r--  include/linux/blk-cgroup.h          655
-rw-r--r--  include/linux/blkdev.h               21
-rw-r--r--  include/linux/cgroup.h               25
-rw-r--r--  include/linux/fs.h                   26
-rw-r--r--  include/linux/memcontrol.h           29
-rw-r--r--  include/linux/mm.h                    8
-rw-r--r--  include/linux/pagemap.h               3
-rw-r--r--  include/linux/writeback.h           221
-rw-r--r--  include/trace/events/writeback.h     15
12 files changed, 1617 insertions, 201 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644
index 000000000000..a48d90e3bcbb
--- /dev/null
+++ b/include/linux/backing-dev-defs.h
@@ -0,0 +1,255 @@
+#ifndef __LINUX_BACKING_DEV_DEFS_H
+#define __LINUX_BACKING_DEV_DEFS_H
+
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
+#include <linux/flex_proportions.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+struct page;
+struct device;
+struct dentry;
+
+/*
+ * Bits in bdi_writeback.state
+ */
+enum wb_state {
+	WB_registered,		/* bdi_register() was done */
+	WB_writeback_running,	/* Writeback is in progress */
+	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
+};
+
+enum wb_congested_state {
+	WB_async_congested,	/* The async (write) queue is getting full */
+	WB_sync_congested,	/* The sync queue is getting full */
+};
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_stat_item {
+	WB_RECLAIMABLE,
+	WB_WRITEBACK,
+	WB_DIRTIED,
+	WB_WRITTEN,
+	NR_WB_STAT_ITEMS
+};
+
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
+ * wb's can operate mostly independently but should share the congested
+ * state.  To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
+struct bdi_writeback_congested {
+	unsigned long state;		/* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct backing_dev_info *bdi;	/* the associated bdi */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
+	int blkcg_id;			/* ID of the associated blkcg */
+	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
+#endif
+};
+
+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently.  Without cgroup writeback, each bdi
+ * (backing_dev_info) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg.  This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer up to
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
+struct bdi_writeback {
+	struct backing_dev_info *bdi;	/* our parent bdi */
+
+	unsigned long state;		/* Always use atomic bitops on this */
+	unsigned long last_old_flush;	/* last old data flush */
+
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
+	spinlock_t list_lock;		/* protects the b_* lists */
+
+	struct percpu_counter stat[NR_WB_STAT_ITEMS];
+
+	struct bdi_writeback_congested *congested;
+
+	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long dirtied_stamp;
+	unsigned long written_stamp;	/* pages written at bw_time_stamp */
+	unsigned long write_bandwidth;	/* the estimated write bandwidth */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
+
+	/*
+	 * The base dirty throttle rate, re-calculated on every 200ms.
+	 * All the bdi tasks' dirty rate will be curbed under it.
+	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+	 * in small steps and is much more smooth/stable than the latter.
+	 */
+	unsigned long dirty_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
+
+	struct fprop_local_percpu completions;
+	int dirty_exceeded;
+
+	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
+	struct list_head work_list;
+	struct delayed_work dwork;	/* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct percpu_ref refcnt;	/* used only for !root wb's */
+	struct fprop_local_percpu memcg_completions;
+	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
+	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+
+	union {
+		struct work_struct release_work;
+		struct rcu_head rcu;
+	};
+#endif
+};
+
+struct backing_dev_info {
+	struct list_head bdi_list;
+	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
+	unsigned int capabilities; /* Device capabilities */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
+
+	char *name;
+
+	unsigned int min_ratio;
+	unsigned int max_ratio, max_prop_frac;
+
+	/*
+	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
+	 * any dirty wbs, which is depended upon by bdi_has_dirty_io().
+	 */
+	atomic_long_t tot_write_bandwidth;
+
+	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+	struct rb_root cgwb_congested_tree; /* their congested states */
+	atomic_t usage_cnt; /* counts both cgwbs and cgwb_congested's */
+#endif
+	wait_queue_head_t wb_waitq;
+
+	struct device *dev;
+
+	struct timer_list laptop_mode_wb_timer;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debug_dir;
+	struct dentry *debug_stats;
+#endif
+};
+
+enum {
+	BLK_RW_ASYNC	= 0,
+	BLK_RW_SYNC	= 1,
+};
+
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync);
+
+static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	clear_wb_congested(bdi->wb.congested, sync);
+}
+
+static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	set_wb_congested(bdi->wb.congested, sync);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return percpu_ref_is_dying(&wb->refcnt);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return false;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+#endif	/* __LINUX_BACKING_DEV_DEFS_H */
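
The congested-state plumbing above is what the block layer drives when a device queue backs up. A minimal sketch of the intended call pattern follows; the example_* helper is hypothetical, not code from this series:

#include <linux/backing-dev-defs.h>

/*
 * Hypothetical back-pressure hook: flag the bdi's root wb as congested for
 * async (write) requests while the device is backed up, and clear the flag
 * again once the queue drains.
 */
static void example_set_write_backpressure(struct backing_dev_info *bdi,
					   bool backed_up)
{
	if (backed_up)
		set_bdi_congested(bdi, BLK_RW_ASYNC);
	else
		clear_bdi_congested(bdi, BLK_RW_ASYNC);
}
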
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d87d8eced064..0e6d4828a77a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -8,106 +8,13 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
-#include <linux/percpu_counter.h>
-#include <linux/log2.h>
-#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/timer.h>
+#include <linux/blkdev.h>
 #include <linux/writeback.h>
-#include <linux/atomic.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-
-struct page;
-struct device;
-struct dentry;
-
-/*
- * Bits in backing_dev_info.state
- */
-enum bdi_state {
-	BDI_async_congested,	/* The async (write) queue is getting full */
-	BDI_sync_congested,	/* The sync queue is getting full */
-	BDI_registered,		/* bdi_register() was done */
-	BDI_writeback_running,	/* Writeback is in progress */
-};
-
-typedef int (congested_fn)(void *, int);
-
-enum bdi_stat_item {
-	BDI_RECLAIMABLE,
-	BDI_WRITEBACK,
-	BDI_DIRTIED,
-	BDI_WRITTEN,
-	NR_BDI_STAT_ITEMS
-};
-
-#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-struct bdi_writeback {
-	struct backing_dev_info *bdi;	/* our parent bdi */
-
-	unsigned long last_old_flush;	/* last old data flush */
-
-	struct delayed_work dwork;	/* work item used for writeback */
-	struct list_head b_dirty;	/* dirty inodes */
-	struct list_head b_io;		/* parked for writeback */
-	struct list_head b_more_io;	/* parked for more writeback */
-	struct list_head b_dirty_time;	/* time stamps are dirty */
-	spinlock_t list_lock;		/* protects the b_* lists */
-};
-
-struct backing_dev_info {
-	struct list_head bdi_list;
-	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
-	unsigned long state;	/* Always use atomic bitops on this */
-	unsigned int capabilities; /* Device capabilities */
-	congested_fn *congested_fn; /* Function pointer if device is md/dm */
-	void *congested_data;	/* Pointer to aux data for congested func */
-
-	char *name;
-
-	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-
-	unsigned long bw_time_stamp;	/* last time write bw is updated */
-	unsigned long dirtied_stamp;
-	unsigned long written_stamp;	/* pages written at bw_time_stamp */
-	unsigned long write_bandwidth;	/* the estimated write bandwidth */
-	unsigned long avg_write_bandwidth; /* further smoothed write bw */
-
-	/*
-	 * The base dirty throttle rate, re-calculated on every 200ms.
-	 * All the bdi tasks' dirty rate will be curbed under it.
-	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-	 * in small steps and is much more smooth/stable than the latter.
-	 */
-	unsigned long dirty_ratelimit;
-	unsigned long balanced_dirty_ratelimit;
-
-	struct fprop_local_percpu completions;
-	int dirty_exceeded;
-
-	unsigned int min_ratio;
-	unsigned int max_ratio, max_prop_frac;
-
-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	spinlock_t wb_lock;	  /* protects work_list & wb.dwork scheduling */
-
-	struct list_head work_list;
-
-	struct device *dev;
-
-	struct timer_list laptop_mode_wb_timer;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debug_dir;
-	struct dentry *debug_stats;
-#endif
-};
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode);
+#include <linux/blk-cgroup.h>
+#include <linux/backing-dev-defs.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -117,97 +24,99 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason);
+void wb_start_background_writeback(struct bdi_writeback *wb);
+void wb_workfn(struct work_struct *work);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
 
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&wb->b_dirty) ||
-	       !list_empty(&wb->b_io) ||
-	       !list_empty(&wb->b_more_io);
+	return test_bit(WB_has_dirty_io, &wb->state);
+}
+
+static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	/*
+	 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+	 * any dirty wbs.  See wb_update_write_bandwidth().
+	 */
+	return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item, s64 amount)
+static inline void __add_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item, s64 amount)
 {
-	__percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+	__percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __inc_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, 1);
+	__add_wb_stat(wb, item, 1);
 }
 
-static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__inc_bdi_stat(bdi, item);
+	__inc_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __dec_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, -1);
+	__add_wb_stat(wb, item, -1);
 }
 
-static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__dec_bdi_stat(bdi, item);
+	__dec_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline s64 bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_read_positive(&wb->stat[item]);
 }
 
-static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+				enum wb_stat_item item)
 {
-	return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_sum_positive(&wb->stat[item]);
 }
 
-static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	s64 sum;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	sum = __bdi_stat_sum(bdi, item);
+	sum = __wb_stat_sum(wb, item);
 	local_irq_restore(flags);
 
 	return sum;
 }
 
-extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+extern void wb_writeout_inc(struct bdi_writeback *wb);
 
 /*
  * maximal error of a stat counter.
  */
-static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
 {
 #ifdef CONFIG_SMP
-	return nr_cpu_ids * BDI_STAT_BATCH;
+	return nr_cpu_ids * WB_STAT_BATCH;
 #else
 	return 1;
 #endif
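
The per-wb counters above replace the old per-bdi ones one for one. A short sketch of how a caller might read them (hypothetical helper, assuming in-tree compilation):

#include <linux/backing-dev.h>

/* Hypothetical: approximate outstanding writeback work on one wb. */
static s64 example_wb_outstanding_pages(struct bdi_writeback *wb)
{
	/*
	 * wb_stat() is a cheap percpu read that can be off by up to
	 * wb_stat_error(); use wb_stat_sum() where accuracy matters.
	 */
	return wb_stat(wb, WB_RECLAIMABLE) + wb_stat(wb, WB_WRITEBACK);
}
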
@@ -231,50 +140,57 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_NO_WRITEBACK:   Don't write pages back
  * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+ *
+ * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
 #define BDI_CAP_NO_ACCT_WB	0x00000004
 #define BDI_CAP_STABLE_WRITES	0x00000008
 #define BDI_CAP_STRICTLIMIT	0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
 
 extern struct backing_dev_info noop_backing_dev_info;
 
-int writeback_in_progress(struct backing_dev_info *bdi);
-
-static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @wb: bdi_writeback of interest
+ *
+ * Determine whether there is writeback waiting to be handled against a
+ * bdi_writeback.
+ */
+static inline bool writeback_in_progress(struct bdi_writeback *wb)
 {
-	if (bdi->congested_fn)
-		return bdi->congested_fn(bdi->congested_data, bdi_bits);
-	return (bdi->state & bdi_bits);
+	return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
-	return bdi_congested(bdi, 1 << BDI_sync_congested);
-}
+	struct super_block *sb;
 
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
-	return bdi_congested(bdi, 1 << BDI_async_congested);
+	if (!inode)
+		return &noop_backing_dev_info;
+
+	sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+	if (sb_is_blkdev_sb(sb))
+		return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+	return sb->s_bdi;
 }
 
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
-	return bdi_congested(bdi, (1 << BDI_sync_congested) |
-				  (1 << BDI_async_congested));
-}
+	struct backing_dev_info *bdi = wb->bdi;
 
-enum {
-	BLK_RW_ASYNC	= 0,
-	BLK_RW_SYNC	= 1,
-};
+	if (bdi->congested_fn)
+		return bdi->congested_fn(bdi->congested_data, cong_bits);
+	return wb->congested->state & cong_bits;
+}
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
 long wait_iff_congested(struct zone *zone, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@ -318,4 +234,333 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }
 
-#endif		/* _LINUX_BACKING_DEV_H */
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+int inode_congested(struct inode *inode, int cong_bits);
+
+/**
+ * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+ * @inode: inode of interest
+ *
+ * cgroup writeback requires support from both the bdi and filesystem.
+ * Test whether @inode has both.
+ */
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	return bdi_cap_account_dirty(bdi) &&
+		(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+		(inode->i_sb->s_iflags & SB_I_CGROUPWB);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock() which protects the returned wb.
+ * NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+
+	memcg_css = task_css(current, memory_cgrp_id);
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+	/*
+	 * %current's blkcg equals the effective blkcg of its memcg.  No
+	 * need to use the relatively expensive cgroup_get_e_css().
+	 */
+	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+		return wb;
+	return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg.  This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	struct bdi_writeback *wb;
+
+	rcu_read_lock();
+	wb = wb_find_current(bdi);
+	if (wb && unlikely(!wb_tryget(wb)))
+		wb = NULL;
+	rcu_read_unlock();
+
+	if (unlikely(!wb)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		memcg_css = task_get_css(current, memory_cgrp_id);
+		wb = wb_get_create(bdi, memcg_css, gfp);
+		css_put(memcg_css);
+	}
+	return wb;
+}
+
+/**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated.  May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with.  The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks &&
+		     (!lockdep_is_held(&inode->i_lock) &&
+		      !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+		      !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
+	return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockedp
+ * afterwards and can't sleep during transaction.  IRQ may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	rcu_read_lock();
+
+	/*
+	 * Paired with store_release in inode_switch_wb_work_fn() and
+	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+	 */
+	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+	if (unlikely(*lockedp))
+		spin_lock_irq(&inode->i_mapping->tree_lock);
+
+	/*
+	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+	 * inode_to_wb() will bark.  Deref directly.
+	 */
+	return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+	if (unlikely(locked))
+		spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+	rcu_read_unlock();
+}
+
+struct wb_iter {
+	int			start_blkcg_id;
+	struct radix_tree_iter	tree_iter;
+	void			**slot;
+};
+
+static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+						   struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter *titer = &iter->tree_iter;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (iter->start_blkcg_id >= 0) {
+		iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+		iter->start_blkcg_id = -1;
+	} else {
+		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+	}
+
+	if (!iter->slot)
+		iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+	if (iter->slot)
+		return *iter->slot;
+	return NULL;
+}
+
+static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+						   struct backing_dev_info *bdi,
+						   int start_blkcg_id)
+{
+	iter->start_blkcg_id = start_blkcg_id;
+
+	if (start_blkcg_id)
+		return __wb_iter_next(iter, bdi);
+	else
+		return &bdi->wb;
+}
+
+/**
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * @wb_cur: cursor struct bdi_writeback pointer
+ * @bdi: bdi to walk wb's of
+ * @iter: pointer to struct wb_iter to be used as iteration buffer
+ * @start_blkcg_id: blkcg ID to start iteration from
+ *
+ * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+ * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * to be used as temp storage during iteration.  rcu_read_lock() must be
+ * held throughout iteration.
+ */
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);	\
+	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	return false;
+}
+
+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	return bdi->wb.congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	return &bdi->wb;
+}
+
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return true;
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
+struct wb_iter {
+	int		next_id;
+};
+
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((iter)->next_id = (start_blkcg_id);			\
+	     ({	(wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+
+static inline int inode_congested(struct inode *inode, int cong_bits)
+{
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline int inode_read_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_sync_congested);
+}
+
+static inline int inode_write_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_async_congested);
+}
+
+static inline int inode_rw_congested(struct inode *inode)
+{
+	return inode_congested(inode, (1 << WB_sync_congested) |
+				      (1 << WB_async_congested));
+}
+
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+	return wb_congested(&bdi->wb, cong_bits);
+}
+
+static inline int bdi_read_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_sync_congested);
+}
+
+static inline int bdi_write_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_async_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, (1 << WB_sync_congested) |
+				  (1 << WB_async_congested));
+}
+
+#endif	/* _LINUX_BACKING_DEV_H */
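
bdi_for_each_wb() documented above must run under rcu_read_lock() for the whole walk. A minimal sketch of the iteration pattern, using a hypothetical example_* helper:

#include <linux/backing-dev.h>
#include <linux/rcupdate.h>

/* Hypothetical: count how many wbs of @bdi currently have dirty IO queued. */
static int example_count_dirty_wbs(struct backing_dev_info *bdi)
{
	struct bdi_writeback *wb;
	struct wb_iter iter;
	int nr = 0;

	rcu_read_lock();
	bdi_for_each_wb(wb, bdi, &iter, 0) {
		if (wb_has_dirty_io(wb))
			nr++;
	}
	rcu_read_unlock();

	return nr;
}
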
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f0291cf64cc5..5e963a6d7c14 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
+static inline int bio_associate_blkcg(struct bio *bio,
+			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif	/* CONFIG_BLK_CGROUP */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
new file mode 100644
index 000000000000..58cfab80dd70
--- /dev/null
+++ b/include/linux/blk-cgroup.h
@@ -0,0 +1,655 @@
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/seq_file.h>
+#include <linux/radix-tree.h>
+#include <linux/blkdev.h>
+#include <linux/atomic.h>
+
+/* Max limits for throttle policy */
+#define THROTL_IOPS_MAX		UINT_MAX
+
+#ifdef CONFIG_BLK_CGROUP
+
+enum blkg_rwstat_type {
+	BLKG_RWSTAT_READ,
+	BLKG_RWSTAT_WRITE,
+	BLKG_RWSTAT_SYNC,
+	BLKG_RWSTAT_ASYNC,
+
+	BLKG_RWSTAT_NR,
+	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+struct blkcg_gq;
+
+struct blkcg {
+	struct cgroup_subsys_state	css;
+	spinlock_t			lock;
+
+	struct radix_tree_root		blkg_tree;
+	struct blkcg_gq			*blkg_hint;
+	struct hlist_head		blkg_list;
+
+	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head		cgwb_list;
+#endif
+};
+
+struct blkg_stat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt;
+};
+
+struct blkg_rwstat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt[BLKG_RWSTAT_NR];
+};
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q).  This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each has its private
+ * data on each blkg, the size of which is determined by
+ * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+ * together with blkg and invokes pd_init/exit_fn() methods.
+ *
+ * Such private data must embed struct blkg_policy_data (pd) at the
+ * beginning and pd_size can't be smaller than pd.
+ */
+struct blkg_policy_data {
+	/* the blkg and policy id this per-policy data belongs to */
+	struct blkcg_gq			*blkg;
+	int				plid;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent
+ * from any request_queue associated to it must specify its size
+ * with the cpd_size field of the blkcg_policy structure and
+ * embed a blkcg_policy_data in it. blkcg core allocates
+ * policy-specific per-blkcg structures lazily the first time
+ * they are actually needed, so it handles them together with
+ * blkgs. cpd_init() is invoked to let each policy handle
+ * per-blkcg data.
+ */
+struct blkcg_policy_data {
+	/* the policy id this per-policy data belongs to */
+	int				plid;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
+};
+
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
+	/* Pointer to the associated request_queue */
+	struct request_queue		*q;
+	struct list_head		q_node;
+	struct hlist_node		blkcg_node;
+	struct blkcg			*blkcg;
+
+	/*
+	 * Each blkg gets congested separately and the congestion state is
+	 * propagated to the matching bdi_writeback_congested.
+	 */
+	struct bdi_writeback_congested	*wb_congested;
+
+	/* all non-root blkcg_gq's are guaranteed to have access to parent */
+	struct blkcg_gq			*parent;
+
+	/* request allocation list for this blkcg-q pair */
+	struct request_list		rl;
+
+	/* reference count */
+	atomic_t			refcnt;
+
+	/* is this blkg online? protected by both blkcg and q locks */
+	bool				online;
+
+	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
+
+	struct rcu_head			rcu_head;
+};
+
+typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+
+struct blkcg_policy {
+	int				plid;
+	/* policy specific private data size */
+	size_t				pd_size;
+	/* policy specific per-blkcg data size */
+	size_t				cpd_size;
+	/* cgroup files for the policy */
+	struct cftype			*cftypes;
+
+	/* operations */
+	blkcg_pol_init_cpd_fn		*cpd_init_fn;
+	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_online_pd_fn		*pd_online_fn;
+	blkcg_pol_offline_pd_fn		*pd_offline_fn;
+	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern struct cgroup_subsys_state * const blkcg_root_css;
+
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_drain_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkcg_policy *pol);
+
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
+		       const struct blkcg_policy *pol, int data,
+		       bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat);
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off);
+
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off);
+
+struct blkg_conf_ctx {
+	struct gendisk			*disk;
+	struct blkcg_gq			*blkg;
+	u64				v;
+};
+
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct blkcg, css) : NULL;
+}
+
+static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+{
+	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+}
+
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+	if (bio && bio->bi_css)
+		return css_to_blkcg(bio->bi_css);
+	return task_blkcg(current);
+}
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return task_get_css(task, blkio_cgrp_id);
+}
+
+/**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+	return css_to_blkcg(blkcg->css.parent);
+}
+
+/**
+ * blkg_to_pd - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol)
+{
+	return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+						     struct blkcg_policy *pol)
+{
+	return blkcg ? blkcg->pd[pol->plid] : NULL;
+}
+
+/**
+ * pd_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data.  Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+	return pd ? pd->blkg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+	char *p;
+
+	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	if (!p) {
+		strncpy(buf, "<unavailable>", buflen);
+		return -ENAMETOOLONG;
+	}
+
+	memmove(buf, p, buf + buflen - p);
+	return 0;
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+	atomic_inc(&blkg->refcnt);
+}
+
+void __blkg_release_rcu(struct rcu_head *rcu);
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+	if (atomic_dec_and_test(&blkg->refcnt))
+		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+}
+
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
+					      (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead.  Synchronization rules are the same.  @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
+					      (p_blkg)->q, false)))
+
+/**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return non-%NULL
+ * request_list.
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+					      struct bio *bio)
+{
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	blkcg = bio_blkcg(bio);
+
+	/* bypass blkg lookup and use @q->root_rl directly for root */
+	if (blkcg == &blkcg_root)
+		goto root_rl;
+
+	/*
+	 * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+	 * or if either the blkcg or queue is going away.  Fall back to
+	 * root_rl in such cases.
+	 */
+	blkg = blkg_lookup_create(blkcg, q);
+	if (unlikely(IS_ERR(blkg)))
+		goto root_rl;
+
+	blkg_get(blkg);
+	rcu_read_unlock();
+	return &blkg->rl;
+root_rl:
+	rcu_read_unlock();
+	return &q->root_rl;
+}
+
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+	/* root_rl may not have blkg set */
+	if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+		blkg_put(rl->blkg);
+}
+
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+	rq->rl = rl;
+}
+
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+	return rq->rl;
+}
+
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+					 struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ *
+ * Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)	\
+	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+
+static inline void blkg_stat_init(struct blkg_stat *stat)
+{
+	u64_stats_init(&stat->syncp);
+}
+
+/**
+ * blkg_stat_add - add a value to a blkg_stat
+ * @stat: target blkg_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller is responsible for synchronizing calls to
+ * this function.
+ */
+static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+{
+	u64_stats_update_begin(&stat->syncp);
+	stat->cnt += val;
+	u64_stats_update_end(&stat->syncp);
+}
+
+/**
+ * blkg_stat_read - read the current value of a blkg_stat
+ * @stat: blkg_stat to read
+ *
+ * Read the current value of @stat.  This function can be called without
+ * synchronization and takes care of u64 atomicity.
+ */
+static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+{
+	unsigned int start;
+	uint64_t v;
+
+	do {
+		start = u64_stats_fetch_begin_irq(&stat->syncp);
+		v = stat->cnt;
+	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
+
+	return v;
+}
+
+/**
+ * blkg_stat_reset - reset a blkg_stat
+ * @stat: blkg_stat to reset
+ */
+static inline void blkg_stat_reset(struct blkg_stat *stat)
+{
+	stat->cnt = 0;
+}
+
+/**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+	blkg_stat_add(to, blkg_stat_read(from));
+}
+
+static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+{
+	u64_stats_init(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @rw: mask of REQ_{WRITE|SYNC}
+ * @val: value to add
+ *
+ * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+				   int rw, uint64_t val)
+{
+	u64_stats_update_begin(&rwstat->syncp);
+
+	if (rw & REQ_WRITE)
+		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+	if (rw & REQ_SYNC)
+		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+
+	u64_stats_update_end(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it as the return value.
+ * This function can be called without synchronization and takes care of
+ * u64 atomicity.
+ */
+static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+{
+	unsigned int start;
+	struct blkg_rwstat tmp;
+
+	do {
+		start = u64_stats_fetch_begin_irq(&rwstat->syncp);
+		tmp = *rwstat;
+	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+
+	return tmp;
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction.  This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+
+	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+}
+
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+				     struct blkg_rwstat *from)
+{
+	struct blkg_rwstat v = blkg_rwstat_read(from);
+	int i;
+
+	u64_stats_update_begin(&to->syncp);
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		to->cnt[i] += v.cnt[i];
+	u64_stats_update_end(&to->syncp);
+}
+
+#else	/* CONFIG_BLK_CGROUP */
+
+struct blkcg {
+};
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_gq {
+};
+
+struct blkcg_policy {
+};
+
+#define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return NULL;
+}
+
+#ifdef CONFIG_BLOCK
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_drain_queue(struct request_queue *q) { }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+					const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+					   const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+					      struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+
+#define blk_queue_for_each_rl(rl, q)	\
+	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif	/* CONFIG_BLOCK */
+#endif	/* CONFIG_BLK_CGROUP */
+#endif	/* _BLK_CGROUP_H */
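
The blkg_rwstat helpers above split one counter by direction and sync-ness behind u64_stats_sync. A short sketch of the writer/reader pattern a blkcg policy might follow; the example_* functions are hypothetical and assume CONFIG_BLK_CGROUP plus an rwstat already set up with blkg_rwstat_init():

#include <linux/blk-cgroup.h>
#include <linux/bio.h>

/* Hypothetical writer: callers serialize updates, e.g. under the queue lock. */
static void example_account_bio(struct blkg_rwstat *rwstat, struct bio *bio)
{
	blkg_rwstat_add(rwstat, bio->bi_rw, bio->bi_iter.bi_size);
}

/* Hypothetical reader: lockless; blkg_rwstat_total() copes with u64 tearing. */
static u64 example_bytes_serviced(struct blkg_rwstat *rwstat)
{
	return blkg_rwstat_total(rwstat);
}
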
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5ced29cef03f..7f2f54b4587f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -12,7 +12,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
-#include <linux/backing-dev.h>
+#include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
@@ -787,25 +787,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-/*
- * A queue has just exitted congestion.  Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
-	clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion.  Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
-	set_bdi_congested(&q->backing_dev_info, sync);
-}
-
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c3102a..e7da0aa65b2d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -774,6 +774,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 }
 
 /**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it.  This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	while (true) {
+		css = task_css(task, subsys_id);
+		if (likely(css_tryget_online(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
+/**
  * task_css_is_root - test whether a task belongs to the root css
  * @task: the target task
  * @subsys_id: the target subsystem ID
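
task_get_css() above is the primitive wb_get_create_current() builds on. A minimal sketch of the get/use/put pattern; the example_* helper is hypothetical and assumes CONFIG_MEMCG for memory_cgrp_id:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical: pin the memory css of @task long enough to read its ID. */
static int example_task_memcg_id(struct task_struct *task)
{
	struct cgroup_subsys_state *css;
	int id;

	css = task_get_css(task, memory_cgrp_id);	/* guaranteed non-NULL */
	id = css->id;
	css_put(css);

	return id;
}
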
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5db7b1379d17..e351da4a934f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -35,6 +35,7 @@
 #include <uapi/linux/fs.h>
 
 struct backing_dev_info;
+struct bdi_writeback;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
@@ -634,6 +635,14 @@ struct inode {
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int			i_wb_frn_winner;
+	u16			i_wb_frn_avg_time;
+	u16			i_wb_frn_history;
+#endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
 	union {
@@ -1232,6 +1241,8 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
+/* sb->s_iflags */
+#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -1270,6 +1281,7 @@ struct super_block {
 	const struct quotactl_ops	*s_qcop;
 	const struct export_operations *s_export_op;
 	unsigned long		s_flags;
+	unsigned long		s_iflags;	/* internal SB_I_* flags */
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
@@ -1806,6 +1818,11 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
+ *			synchronize competing switching instances and to tell
+ *			wb stat updates to grab mapping->tree_lock.  See
+ *			inode_switch_wb_work_fn() for details.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1825,6 +1842,7 @@ struct super_operations {
 #define I_DIRTY_TIME		(1 << 11)
 #define __I_DIRTY_TIME_EXPIRED	12
 #define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH		(1 << 13)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@ -2241,7 +2259,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+	return sb == blockdev_superblock;
+}
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
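
SB_I_CGROUPWB is the filesystem half of the opt-in (the bdi advertises BDI_CAP_CGROUP_WRITEBACK for the other half). A sketch of how a purely hypothetical filesystem ("examplefs") would flip it on while filling its superblock:

#include <linux/fs.h>

/* Hypothetical fill_super fragment for a cgroup-writeback-aware filesystem. */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	sb->s_iflags |= SB_I_CGROUPWB;	/* writeback may be split per memcg */
	/* ... the usual superblock, root inode and s_op setup ... */
	return 0;
}
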
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c8918114804..73b02b0a8f60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
 	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
@@ -67,6 +68,8 @@ enum mem_cgroup_events_index {
 };
 
 #ifdef CONFIG_MEMCG
+extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
 void mem_cgroup_events(struct mem_cgroup *memcg,
 		       enum mem_cgroup_events_index idx,
 		       unsigned int nr);
@@ -112,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
@@ -195,6 +199,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
+#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 				     enum mem_cgroup_events_index idx,
 				     unsigned int nr)
@@ -382,6 +388,29 @@ enum {
 	OVER_LIMIT,
 };
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+			 unsigned long *pdirty, unsigned long *pwriteback);
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+	return NULL;
+}
+
+static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+				       unsigned long *pavail,
+				       unsigned long *pdirty,
+				       unsigned long *pwriteback)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 void sock_update_memcg(struct sock *sk);
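mem_cgroup_wb_domain() and mem_cgroup_wb_stats() are the memcg-side hooks the writeback code uses to find the per-memcg wb_domain and its available/dirty/writeback page counts; the !CONFIG_CGROUP_WRITEBACK stubs return NULL and leave the output parameters untouched. A hedged sketch of a consumer (the reporting function itself is illustrative, not part of this patch):

#include <linux/memcontrol.h>
#include <linux/backing-dev-defs.h>
#include <linux/printk.h>

/*
 * Sketch: querying per-memcg writeback state for a bdi_writeback.  Only
 * mem_cgroup_wb_domain() and mem_cgroup_wb_stats() come from the
 * declarations above; the outputs are pre-initialised because the
 * !CONFIG_CGROUP_WRITEBACK stubs do not write to them.
 */
static void example_report_memcg_wb(struct bdi_writeback *wb)
{
	struct wb_domain *dom = mem_cgroup_wb_domain(wb);
	unsigned long avail = 0, dirty = 0, writeback = 0;

	mem_cgroup_wb_stats(wb, &avail, &dirty, &writeback);
	if (dom)	/* NULL when cgroup writeback is not configured */
		pr_debug("memcg wb: avail=%lu dirty=%lu writeback=%lu\n",
			 avail, dirty, writeback);
}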
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24ad583596d1..99959a34f4f1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct bdi_writeback;
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1211,10 +1212,13 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
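account_page_dirtied() and account_page_cleaned() now take the page's mem_cgroup (and, for cleaning, the bdi_writeback) so dirty pages can be charged per cgroup, and cancel_dirty_page() becomes a real function. A simplified sketch of how a caller threads the memcg through; the mem_cgroup_begin_page_stat()/mem_cgroup_end_page_stat() pairing is an assumption about this kernel generation, and the mapping->tree_lock and radix-tree tagging done by the real set_page_dirty paths are omitted:

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Simplified sketch: pin the page's memcg around the accounting call.
 * Locking (mapping->tree_lock, dirty tag updates) is intentionally left out.
 */
static int example_mark_page_dirty(struct page *page,
				   struct address_space *mapping)
{
	struct mem_cgroup *memcg;
	int newly_dirty;

	memcg = mem_cgroup_begin_page_stat(page);	/* assumed helper of this era */
	newly_dirty = !TestSetPageDirty(page);
	if (newly_dirty)
		account_page_dirtied(page, mapping, memcg);
	mem_cgroup_end_page_stat(memcg);
	return newly_dirty;
}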
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..fb0814ca65c7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+				     struct mem_cgroup *memcg);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2dd371ec0ca..b333c945e571 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/fs.h>
+#include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -84,18 +86,95 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+	struct inode *inode;		/* inode being written out */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int wb_id;			/* current wb id */
+	int wb_lcand_id;		/* last foreign candidate wb id */
+	int wb_tcand_id;		/* this foreign candidate wb id */
+	size_t wb_bytes;		/* bytes written by current wb */
+	size_t wb_lcand_bytes;		/* bytes written by last candidate */
+	size_t wb_tcand_bytes;		/* bytes written by this candidate */
+#endif
 };
 
 /*
+ * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+ * and are measured against each other in.  There always is one global
+ * domain, global_wb_domain, that every wb in the system is a member of.
+ * This allows measuring the relative bandwidth of each wb to distribute
+ * dirtyable memory accordingly.
+ */
+struct wb_domain {
+	spinlock_t lock;
+
+	/*
+	 * Scale the writeback cache size proportional to the relative
+	 * writeout speed.
+	 *
+	 * We do this by keeping a floating proportion between BDIs, based
+	 * on page writeback completions [end_page_writeback()]. Those
+	 * devices that write out pages fastest will get the larger share,
+	 * while the slower will get a smaller share.
+	 *
+	 * We use page writeout completions because we are interested in
+	 * getting rid of dirty pages. Having them written out is the
+	 * primary goal.
+	 *
+	 * We introduce a concept of time, a period over which we measure
+	 * these events, because demand can/will vary over time. The length
+	 * of this period itself is measured in page writeback completions.
+	 */
+	struct fprop_global completions;
+	struct timer_list period_timer;	/* timer for aging of completions */
+	unsigned long period_time;
+
+	/*
+	 * The dirtyable memory and dirty threshold could be suddenly
+	 * knocked down by a large amount (eg. on the startup of KVM in a
+	 * swapless system). This may throw the system into deep dirty
+	 * exceeded state and throttle heavy/light dirtiers alike. To
+	 * retain good responsiveness, maintain global_dirty_limit for
+	 * tracking slowly down to the knocked down dirty threshold.
+	 *
+	 * Both fields are protected by ->lock.
+	 */
+	unsigned long dirty_limit_tstamp;
+	unsigned long dirty_limit;
+};
+
+/**
+ * wb_domain_size_changed - memory available to a wb_domain has changed
+ * @dom: wb_domain of interest
+ *
+ * This function should be called when the amount of memory available to
+ * @dom has changed.  It resets @dom's dirty limit parameters to prevent
+ * the past values which don't match the current configuration from skewing
+ * dirty throttling.  Without this, when memory size of a wb_domain is
+ * greatly reduced, the dirty throttling logic may allow too many pages to
+ * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
+ * that situation.
+ */
+static inline void wb_domain_size_changed(struct wb_domain *dom)
+{
+	spin_lock(&dom->lock);
+	dom->dirty_limit_tstamp = jiffies;
+	dom->dirty_limit = 0;
+	spin_unlock(&dom->lock);
+}
+
+/*
  * fs/fs-writeback.c
  */	
 struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
-int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
-int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
-				  enum wb_reason reason);
+bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+				   enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
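The comment in wb_domain spells out the idea: each wb's share of the domain-wide dirty threshold follows the fraction of recent page writeback completions it contributed, so faster devices earn a larger share. A standalone arithmetic sketch of that split; the kernel derives the fraction from the flex_proportions counters rather than from raw counts as shown here:

#include <linux/math64.h>

/*
 * Sketch: proportional split of a domain-wide dirty threshold.
 * wb_completions / dom_completions stands in for the aged fraction the
 * flex_proportions machinery maintains per wb.
 */
static unsigned long example_wb_share(unsigned long dom_thresh,
				      unsigned long wb_completions,
				      unsigned long dom_completions)
{
	if (!dom_completions)
		return dom_thresh;	/* no history yet: nothing to scale by */
	return div64_u64((u64)dom_thresh * wb_completions, dom_completions);
}

A wb that writes out pages faster completes more of them per period, so its numerator (and therefore its slice of the dirtyable memory) grows, while period_timer ages old completions away so the split keeps tracking current writeout speed.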
@@ -107,6 +186,123 @@ static inline void wait_on_inode(struct inode *inode)
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+	__releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode_attach_wb(inode, NULL);
+	wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback specific initialization of bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc.  Perform
+ * writeback specific initialization.  This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (wbc->wb)
+		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+	__releases(&inode->i_lock)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
+static inline void wbc_account_io(struct writeback_control *wbc,
+				  struct page *page, size_t bytes)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
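Taken together, the helpers above define the per-inode flow the cgroup writeback path expects: attach the wbc to the inode under i_lock, tag each bio and account each page while writing, then detach so the foreign-inode heuristics can decide whether to switch the inode's wb. A sketch of that flow, with the actual page submission elided:

#include <linux/fs.h>
#include <linux/writeback.h>

/*
 * Sketch: intended per-inode usage of the wbc helpers documented above.
 * The page/bio submission loop is elided.
 */
static void example_write_one_inode(struct inode *inode,
				    struct writeback_control *wbc)
{
	spin_lock(&inode->i_lock);
	wbc_attach_and_unlock_inode(wbc, inode);	/* drops i_lock */

	/*
	 * For every bio built while writing this inode's dirty pages:
	 *   wbc_init_bio(wbc, bio);            associate it with the wb's blkcg
	 *   wbc_account_io(wbc, page, bytes);  feed foreign-inode detection
	 */

	wbc_detach_inode(wbc);	/* may decide to switch inode->i_wb */
}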
@@ -120,8 +316,12 @@ static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom);
+#endif
 
-extern unsigned long global_dirty_limit;
+extern struct wb_domain global_wb_domain;
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
@@ -155,19 +355,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
-			       unsigned long dirty);
-
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-			    unsigned long thresh,
-			    unsigned long bg_thresh,
-			    unsigned long dirty,
-			    unsigned long bdi_thresh,
-			    unsigned long bdi_dirty,
-			    unsigned long start_time);
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 				void *data);
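wb_domain_init()/wb_domain_exit() and wb_domain_size_changed() form the lifecycle a non-global domain owner (such as the per-memcg domain added by this series) has to follow, while wb_over_bg_thresh() replaces the old bdi-wide background check. A sketch of that lifecycle under those assumptions; the call sites are illustrative and only the wb_domain_* interfaces are declared above:

#include <linux/gfp.h>
#include <linux/writeback.h>

/* Sketch: lifecycle of a wb_domain owned by some controller structure. */
static int example_domain_create(struct wb_domain *dom)
{
	return wb_domain_init(dom, GFP_KERNEL);	/* 0 on success */
}

static void example_domain_limit_changed(struct wb_domain *dom)
{
	wb_domain_size_changed(dom);	/* drop stale dirty_limit state */
}

static void example_domain_destroy(struct wb_domain *dom)
{
#ifdef CONFIG_CGROUP_WRITEBACK
	wb_domain_exit(dom);	/* declared only for cgroup writeback builds */
#endif
}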
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c178d13d6f4c..a7aa607a4c55 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -360,7 +360,7 @@ TRACE_EVENT(global_dirty_state,
 		__entry->nr_written	= global_page_state(NR_WRITTEN);
 		__entry->background_thresh = background_thresh;
 		__entry->dirty_thresh	= dirty_thresh;
-		__entry->dirty_limit = global_dirty_limit;
+		__entry->dirty_limit	= global_wb_domain.dirty_limit;
 	),
 
 	TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@ -399,13 +399,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 
 	TP_fast_assign(
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-		__entry->write_bw	= KBps(bdi->write_bandwidth);
-		__entry->avg_write_bw	= KBps(bdi->avg_write_bandwidth);
+		__entry->write_bw	= KBps(bdi->wb.write_bandwidth);
+		__entry->avg_write_bw	= KBps(bdi->wb.avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
-		__entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+		__entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
-					  KBps(bdi->balanced_dirty_ratelimit);
+					KBps(bdi->wb.balanced_dirty_ratelimit);
 	),
 
 	TP_printk("bdi %s: "
@@ -462,8 +462,9 @@ TRACE_EVENT(balance_dirty_pages,
 		unsigned long freerun = (thresh + bg_thresh) / 2;
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
 
-		__entry->limit		= global_dirty_limit;
-		__entry->setpoint	= (global_dirty_limit + freerun) / 2;
+		__entry->limit		= global_wb_domain.dirty_limit;
+		__entry->setpoint	= (global_wb_domain.dirty_limit +
+						freerun) / 2;
 		__entry->dirty		= dirty;
 		__entry->bdi_setpoint	= __entry->setpoint *
 						bdi_thresh / (thresh + 1);