From eea8f41cc58849e354ecf8b95bd7f806e1d1f703 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:17 -0400 Subject: blkcg: move block/blk-cgroup.h to include/linux/blk-cgroup.h cgroup aware writeback support will require exposing some of blkcg details. In preprataion, move block/blk-cgroup.h to include/linux/blk-cgroup.h. This patch is pure file move. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 603 --------------------------------------------------- block/blk-core.c | 2 +- block/blk-sysfs.c | 2 +- block/blk-throttle.c | 2 +- block/cfq-iosched.c | 2 +- block/elevator.c | 2 +- 7 files changed, 6 insertions(+), 609 deletions(-) delete mode 100644 block/blk-cgroup.h (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ac817b750db..c3226ce8c5f6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -19,7 +19,7 @@ #include #include #include -#include "blk-cgroup.h" +#include #include "blk.h" #define MAX_KEY_LEN 100 diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h deleted file mode 100644 index c567865b5f1d..000000000000 --- a/block/blk-cgroup.h +++ /dev/null @@ -1,603 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe - * - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * - * Copyright (C) 2009 Vivek Goyal - * Nauman Rafique - */ - -#include -#include -#include -#include -#include -#include - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - atomic_t refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. - */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. - */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index aa819a58ea24..4afac14d4499 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,12 +32,12 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index faaf36ade7eb..5677eb78557d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -8,9 +8,9 @@ #include #include #include +#include #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5b9c6d5c3636..b23193518ac7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include #include #include -#include "blk-cgroup.h" +#include #include "blk.h" /* Max dispatch from a group in 1 round */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5da8e6e9ab4b..bc8f42930773 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include #include #include +#include #include "blk.h" -#include "blk-cgroup.h" /* * tunables diff --git a/block/elevator.c b/block/elevator.c index 59794d0d38e3..3bbb48f430e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,11 +35,11 @@ #include #include #include +#include #include #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); -- cgit 1.4.1 From ec13b1d6f0a0457312e615335ce8ceb07da50a11 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:19 -0400 Subject: blkcg: always create the blkcg_gq for the root blkcg Currently, blkcg does a minor optimization where the root blkcg is created when the first blkcg policy is activated on a queue and destroyed on the deactivation of the last. On systems where blkcg is configured but not used, this saves one blkcg_gq struct per queue. On systems where blkcg is actually used, there's no difference. The only case where this can lead to any meaninful, albeit still minute, save in memory consumption is when all blkcg policies are deactivated after being widely used in the system, which is a hihgly unlikely scenario. The conditional existence of root blkcg_gq has already created several bugs in blkcg and became an issue once again for the new per-cgroup wb_congested mechanism for cgroup writeback support leading to a NULL dereference when no blkcg policy is active. This is really not worth bothering with. This patch makes blkcg always allocate and link the root blkcg_gq and release it only on queue destruction. Signed-off-by: Tejun Heo Reported-by: Fengguang Wu Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 96 +++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 55 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c3226ce8c5f6..2a4f77f2b229 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -235,13 +235,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); @@ -339,15 +334,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); - /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. @@ -855,9 +841,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); - return blk_throtl_init(q); + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); + + if (IS_ERR(blkg)) { + kfree(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -958,52 +980,20 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; + struct blkcg_gq *blkg; struct blkg_policy_data *pd, *n; int cnt = 0, ret; - bool preloaded; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - + /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; - spin_unlock_irq(q->queue_lock); - /* allocate policy_data for all existing blkgs */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { @@ -1072,10 +1062,6 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); -- cgit 1.4.1 From 496d5e7560dbb84399dbd92316fc33857aa83900 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:21 -0400 Subject: blkcg: add blkcg_root_css Add global constant blkcg_root_css which points to &blkcg_root.css. This will be used by cgroup writeback support. If blkcg is disabled, it's defined as ERR_PTR(-EINVAL). v2: The declarations moved to include/linux/blk-cgroup.h as suggested by Vivek. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 ++ include/linux/blk-cgroup.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2a4f77f2b229..54ec1721cd3e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,6 +30,8 @@ struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static bool blkcg_policy_enabled(struct request_queue *q, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 51f95b34d3f0..65f0c178fd04 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -134,6 +134,7 @@ struct blkcg_policy { }; extern struct blkcg blkcg_root; +extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, @@ -570,6 +571,8 @@ struct blkcg_gq { struct blkcg_policy { }; +#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) + #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit 1.4.1 From ec438699a9ae0856c2ce20a50dd39cdc7e92a732 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:22 -0400 Subject: cgroup, block: implement task_get_css() and use it in bio_associate_current() bio_associate_current() currently open codes task_css() and css_tryget_online() to find and pin $current's blkcg css. Abstract it into task_get_css() which is implemented from cgroup side. As a task is always associated with an online css for every subsystem except while the css_set update is propagating, task_get_css() retries till css_tryget_online() succeeds. This is a cleanup and shouldn't lead to noticeable behavior changes. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/bio.c | 11 +---------- include/linux/cgroup.h | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 259197d97de1..c4f87018fb79 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2004,7 +2004,6 @@ EXPORT_SYMBOL(bioset_create_nobvec); int bio_associate_current(struct bio *bio) { struct io_context *ioc; - struct cgroup_subsys_state *css; if (bio->bi_ioc) return -EBUSY; @@ -2013,17 +2012,9 @@ int bio_associate_current(struct bio *bio) if (!ioc) return -ENOENT; - /* acquire active ref on @ioc and associate */ get_io_context_active(ioc); bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget_online(css)) - bio->bi_css = css; - rcu_read_unlock(); - + bio->bi_css = task_get_css(current, blkio_cgrp_id); return 0; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b9cb94c3102a..e7da0aa65b2d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -773,6 +773,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, return task_css_check(task, subsys_id, false); } +/** + * task_get_css - find and get the css for (task, subsys) + * @task: the target task + * @subsys_id: the target subsystem ID + * + * Find the css for the (@task, @subsys_id) combination, increment a + * reference on and return it. This function is guaranteed to return a + * valid css. + */ +static inline struct cgroup_subsys_state * +task_get_css(struct task_struct *task, int subsys_id) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + while (true) { + css = task_css(task, subsys_id); + if (likely(css_tryget_online(css))) + break; + cpu_relax(); + } + rcu_read_unlock(); + return css; +} + /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task -- cgit 1.4.1 From 1d933cf096e3aea15f1aec8297657b7a846fab63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:24 -0400 Subject: blkcg: implement bio_associate_blkcg() Currently, a bio can only be associated with the io_context and blkcg of %current using bio_associate_current(). This is too restrictive for cgroup writeback support. Implement bio_associate_blkcg() which associates a bio with the specified blkcg. bio_associate_blkcg() leaves the io_context unassociated. bio_associate_current() is updated so that it considers a bio as already associated if it has a blkcg_css, instead of an io_context, associated with it. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/bio.c | 24 +++++++++++++++++++++++- include/linux/bio.h | 3 +++ 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c4f87018fb79..2a00d349cd68 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1988,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_ EXPORT_SYMBOL(bioset_create_nobvec); #ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. + */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} + /** * bio_associate_current - associate a bio with %current * @bio: target bio @@ -2005,7 +2027,7 @@ int bio_associate_current(struct bio *bio) { struct io_context *ioc; - if (bio->bi_ioc) + if (bio->bi_css) return -EBUSY; ioc = current->io_context; diff --git a/include/linux/bio.h b/include/linux/bio.h index f0291cf64cc5..5e963a6d7c14 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ +static inline int bio_associate_blkcg(struct bio *bio, + struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -- cgit 1.4.1 From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:26 -0400 Subject: writeback: move backing_dev_info->state into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->state into wb. * enum bdi_state is renamed to wb_state and the prefix of all enums is changed from BDI_ to WB_. * Explicit zeroing of bdi->state is removed without adding zeoring of wb->state as the whole data structure is zeroed on init anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: drbd-dev@lists.linbit.com Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - drivers/block/drbd/drbd_main.c | 10 +++++----- drivers/md/dm.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 2 +- fs/fs-writeback.c | 14 +++++++------- include/linux/backing-dev.h | 24 ++++++++++++------------ mm/backing-dev.c | 20 ++++++++++---------- 8 files changed, 38 insertions(+), 39 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4afac14d4499..c3ba9c3b08d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -621,7 +621,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = 0; q->backing_dev_info.name = "block"; q->node = node_id; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 81fde9ef7f8e..a1518539b858 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2359,7 +2359,7 @@ static void drbd_cleanup(void) * @congested_data: User data * @bdi_bits: Bits the BDI flusher thread is currently interested in * - * Returns 1<connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); /* Without good local data, we would need to read from remote, * and that would need the worker thread as well, which is * currently blocked waiting for that usermode helper to * finish. */ if (!get_ldev_if_state(device, D_UP_TO_DATE)) - r |= (1 << BDI_sync_congested); + r |= (1 << WB_sync_congested); else put_ldev(device); r &= bdi_bits; @@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits) reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && + if (bdi_bits & (1 << WB_async_congested) && test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); reason = reason == 'b' ? 'a' : 'n'; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 38837f8ea327..2161ed9329c4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2074,7 +2074,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * the query about congestion status of request_queue */ if (dm_request_based(md)) - r = md->queue->backing_dev_info.state & + r = md->queue->backing_dev_info.wb.state & bdi_bits; else r = dm_table_any_congested(map, bdi_bits); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9157a29c8dbf..f80f1af61ce7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits) struct r1conf *conf = mddev->private; int i, ret = 0; - if ((bits & (1 << BDI_async_congested)) && + if ((bits & (1 << WB_async_congested)) && conf->pending_count >= max_queued_requests) return 1; @@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits) /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ - if ((bits & (1<backing_dev_info, bits); else ret &= bdi_congested(&q->backing_dev_info, bits); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e793ab6b3570..fca825718f29 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits) struct r10conf *conf = mddev->private; int i, ret = 0; - if ((bits & (1 << BDI_async_congested)) && + if ((bits & (1 << WB_async_congested)) && conf->pending_count >= max_queued_requests) return 1; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 32a8bbd7a9ad..983312cea245 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -74,7 +74,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_writeback_running, &bdi->state); + return test_bit(WB_writeback_running, &bdi->wb.state); } EXPORT_SYMBOL(writeback_in_progress); @@ -112,7 +112,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); static void bdi_wakeup_thread(struct backing_dev_info *bdi) { spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) + if (test_bit(WB_registered, &bdi->wb.state)) mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); spin_unlock_bh(&bdi->wb_lock); } @@ -123,7 +123,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, trace_writeback_queue(bdi, work); spin_lock_bh(&bdi->wb_lock); - if (!test_bit(BDI_registered, &bdi->state)) { + if (!test_bit(WB_registered, &bdi->wb.state)) { if (work->done) complete(work->done); goto out_unlock; @@ -1057,7 +1057,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) struct wb_writeback_work *work; long wrote = 0; - set_bit(BDI_writeback_running, &wb->bdi->state); + set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(bdi)) != NULL) { trace_writeback_exec(bdi, work); @@ -1079,7 +1079,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); - clear_bit(BDI_writeback_running, &wb->bdi->state); + clear_bit(WB_writeback_running, &wb->state); return wrote; } @@ -1099,7 +1099,7 @@ void bdi_writeback_workfn(struct work_struct *work) current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - !test_bit(BDI_registered, &bdi->state))) { + !test_bit(WB_registered, &wb->state))) { /* * The normal path. Keep writing back @bdi until its * work_list is empty. Note that this path is also taken @@ -1323,7 +1323,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_unlock(&inode->i_lock); spin_lock(&bdi->wb.list_lock); if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(BDI_registered, &bdi->state), + WARN(!test_bit(WB_registered, &bdi->wb.state), "bdi-%s not registered\n", bdi->name); /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aff923ae8c4b..eb14f988a63e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,13 +25,13 @@ struct device; struct dentry; /* - * Bits in backing_dev_info.state + * Bits in bdi_writeback.state */ -enum bdi_state { - BDI_async_congested, /* The async (write) queue is getting full */ - BDI_sync_congested, /* The sync queue is getting full */ - BDI_registered, /* bdi_register() was done */ - BDI_writeback_running, /* Writeback is in progress */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ }; typedef int (congested_fn)(void *, int); @@ -49,6 +49,7 @@ enum bdi_stat_item { struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ + unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ struct delayed_work dwork; /* work item used for writeback */ @@ -62,7 +63,6 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned long state; /* Always use atomic bitops on this */ unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ @@ -250,23 +250,23 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->state & bdi_bits); + return (bdi->wb.state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_sync_congested); + return bdi_congested(bdi, 1 << WB_sync_congested); } static inline int bdi_write_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, 1 << BDI_async_congested); + return bdi_congested(bdi, 1 << WB_async_congested); } static inline int bdi_rw_congested(struct backing_dev_info *bdi) { - return bdi_congested(bdi, (1 << BDI_sync_congested) | - (1 << BDI_async_congested)); + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); } enum { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6dc4580df2af..b23cf0ea5912 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -96,7 +96,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io, nr_more_io, nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->state); + !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; @@ -280,7 +280,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) + if (test_bit(WB_registered, &bdi->wb.state)) queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); spin_unlock_bh(&bdi->wb_lock); } @@ -315,7 +315,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); - set_bit(BDI_registered, &bdi->state); + set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); @@ -339,7 +339,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { /* Make sure nobody queues further work */ spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) { spin_unlock_bh(&bdi->wb_lock); return; } @@ -492,11 +492,11 @@ static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { - enum bdi_state bit; + enum wb_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (test_and_clear_bit(bit, &bdi->state)) + bit = sync ? WB_sync_congested : WB_async_congested; + if (test_and_clear_bit(bit, &bdi->wb.state)) atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) @@ -506,10 +506,10 @@ EXPORT_SYMBOL(clear_bdi_congested); void set_bdi_congested(struct backing_dev_info *bdi, int sync) { - enum bdi_state bit; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (!test_and_set_bit(bit, &bdi->state)) + bit = sync ? WB_sync_congested : WB_async_congested; + if (!test_and_set_bit(bit, &bdi->wb.state)) atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); -- cgit 1.4.1 From 66114cad64bf76a155fec1f0fff0de771cf909d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:32 -0400 Subject: writeback: separate out include/linux/backing-dev-defs.h With the planned cgroup writeback support, backing-dev related declarations will be more widely used across block and cgroup; unfortunately, including backing-dev.h from include/linux/blkdev.h makes cyclic include dependency quite likely. This patch separates out backing-dev-defs.h which only has the essential definitions and updates blkdev.h to include it. c files which need access to more backing-dev details now include backing-dev.h directly. This takes backing-dev.h off the common include dependency chain making it a lot easier to use it across block and cgroup. v2: fs/fat build failure fixed. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-integrity.c | 1 + block/blk-sysfs.c | 1 + block/bounce.c | 1 + block/genhd.c | 1 + drivers/block/drbd/drbd_int.h | 1 + drivers/block/pktcdvd.c | 1 + drivers/char/raw.c | 1 + drivers/md/bcache/request.c | 1 + drivers/md/dm.h | 1 + drivers/md/md.h | 1 + drivers/mtd/devices/block2mtd.c | 1 + fs/block_dev.c | 1 + fs/ext4/extents.c | 1 + fs/ext4/mballoc.c | 1 + fs/ext4/super.c | 1 + fs/f2fs/segment.h | 1 + fs/fat/file.c | 1 + fs/fat/inode.c | 1 + fs/hfs/super.c | 1 + fs/hfsplus/super.c | 1 + fs/nfs/filelayout/filelayout.c | 1 + fs/ocfs2/file.c | 1 + fs/reiserfs/super.c | 1 + fs/ufs/super.c | 1 + fs/xfs/xfs_file.c | 1 + include/linux/backing-dev-defs.h | 106 +++++++++++++++++++++++++++++++++++++++ include/linux/backing-dev.h | 102 +------------------------------------ include/linux/blkdev.h | 2 +- mm/madvise.c | 1 + 29 files changed, 134 insertions(+), 102 deletions(-) create mode 100644 include/linux/backing-dev-defs.h (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 79ffb4855af0..f548b64be092 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include +#include #include #include #include diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5677eb78557d..1b60941dc4c6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/block/bounce.c b/block/bounce.c index 4bac72579c1f..072280b3dd13 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/block/genhd.c b/block/genhd.c index 0a536dc05f3b..d46ba566d62f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b905e9888b88..efd19c2da9c2 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 09e628dafd9d..4c20c228184c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 5fc291c6157e..60316fbaf295 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 1616f668a4cb..4afb2d26b148 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e6e66d087b26..7fff744f0865 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/md.h b/drivers/md/md.h index 4046a6c6f223..7da6e9c3cb53 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -16,6 +16,7 @@ #define _MD_MD_H #include +#include #include #include #include diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index b16f3cda97ff..e2c0057737e6 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/block_dev.c b/fs/block_dev.c index c7e4163ede87..e545cbfbe5b2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d74e08029643..e8b5866ffa07 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d1e60214ef0..440987c8ba9e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #ifdef CONFIG_EXT4_DEBUG diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f06d0589ddba..56b8bb75c3fc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6408989857e0..aba72f7a8ac4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ #include +#include /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a0e33e..a08f1039909a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include "fat.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c06774658345..509411dd3698 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "fat.h" diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eee7206c38d1..55c03b9e9070 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 593af2fdcc2d..7302d96ae8bf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a46bf6de9ce4..b34f2e228601 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -32,6 +32,7 @@ #include #include #include +#include #include diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d8b670cbd909..8f1feca89fb0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -37,6 +37,7 @@ #include #include #include +#include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0111ad0466ed..3e0af317fcc3 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include "xattr.h" #include #include +#include #include #include #include diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b3bc3e7ae79d..098508a93c7b 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75352ee..4e00b38efbe0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -41,6 +41,7 @@ #include #include #include +#include static const struct vm_operations_struct xfs_file_vm_ops; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h new file mode 100644 index 000000000000..aa18c4bd43c1 --- /dev/null +++ b/include/linux/backing-dev-defs.h @@ -0,0 +1,106 @@ +#ifndef __LINUX_BACKING_DEV_DEFS_H +#define __LINUX_BACKING_DEV_DEFS_H + +#include +#include +#include +#include +#include +#include + +struct page; +struct device; +struct dentry; + +/* + * Bits in bdi_writeback.state + */ +enum wb_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ + WB_registered, /* bdi_register() was done */ + WB_writeback_running, /* Writeback is in progress */ +}; + +typedef int (congested_fn)(void *, int); + +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS +}; + +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) + +struct bdi_writeback { + struct backing_dev_info *bdi; /* our parent bdi */ + + unsigned long state; /* Always use atomic bitops on this */ + unsigned long last_old_flush; /* last old data flush */ + + struct list_head b_dirty; /* dirty inodes */ + struct list_head b_io; /* parked for writeback */ + struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ + spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; + + unsigned long bw_time_stamp; /* last time write bw is updated */ + unsigned long dirtied_stamp; + unsigned long written_stamp; /* pages written at bw_time_stamp */ + unsigned long write_bandwidth; /* the estimated write bandwidth */ + unsigned long avg_write_bandwidth; /* further smoothed write bw */ + + /* + * The base dirty throttle rate, re-calculated on every 200ms. + * All the bdi tasks' dirty rate will be curbed under it. + * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit + * in small steps and is much more smooth/stable than the latter. + */ + unsigned long dirty_ratelimit; + unsigned long balanced_dirty_ratelimit; + + struct fprop_local_percpu completions; + int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; + + unsigned int min_ratio; + unsigned int max_ratio, max_prop_frac; + + struct bdi_writeback wb; /* default writeback info for this bdi */ + + struct device *dev; + + struct timer_list laptop_mode_wb_timer; + +#ifdef CONFIG_DEBUG_FS + struct dentry *debug_dir; + struct dentry *debug_stats; +#endif +}; + +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + +void clear_bdi_congested(struct backing_dev_info *bdi, int sync); +void set_bdi_congested(struct backing_dev_info *bdi, int sync); + +#endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d796f49ce87a..5e39f7a8efed 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -8,104 +8,11 @@ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H -#include -#include -#include #include #include #include -#include #include -#include -#include -#include - -struct page; -struct device; -struct dentry; - -/* - * Bits in bdi_writeback.state - */ -enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ - WB_registered, /* bdi_register() was done */ - WB_writeback_running, /* Writeback is in progress */ -}; - -typedef int (congested_fn)(void *, int); - -enum wb_stat_item { - WB_RECLAIMABLE, - WB_WRITEBACK, - WB_DIRTIED, - WB_WRITTEN, - NR_WB_STAT_ITEMS -}; - -#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) - -struct bdi_writeback { - struct backing_dev_info *bdi; /* our parent bdi */ - - unsigned long state; /* Always use atomic bitops on this */ - unsigned long last_old_flush; /* last old data flush */ - - struct list_head b_dirty; /* dirty inodes */ - struct list_head b_io; /* parked for writeback */ - struct list_head b_more_io; /* parked for more writeback */ - struct list_head b_dirty_time; /* time stamps are dirty */ - spinlock_t list_lock; /* protects the b_* lists */ - - struct percpu_counter stat[NR_WB_STAT_ITEMS]; - - unsigned long bw_time_stamp; /* last time write bw is updated */ - unsigned long dirtied_stamp; - unsigned long written_stamp; /* pages written at bw_time_stamp */ - unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ - - /* - * The base dirty throttle rate, re-calculated on every 200ms. - * All the bdi tasks' dirty rate will be curbed under it. - * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit - * in small steps and is much more smooth/stable than the latter. - */ - unsigned long dirty_ratelimit; - unsigned long balanced_dirty_ratelimit; - - struct fprop_local_percpu completions; - int dirty_exceeded; - - spinlock_t work_lock; /* protects work_list & dwork scheduling */ - struct list_head work_list; - struct delayed_work dwork; /* work item used for writeback */ -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; - - unsigned int min_ratio; - unsigned int max_ratio, max_prop_frac; - - struct bdi_writeback wb; /* default writeback info for this bdi */ - - struct device *dev; - - struct timer_list laptop_mode_wb_timer; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debug_dir; - struct dentry *debug_stats; -#endif -}; +#include struct backing_dev_info *inode_to_bdi(struct inode *inode); @@ -265,13 +172,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); int pdflush_proc_obsolete(struct ctl_table *table, int write, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ccaa9aecd593..60d2726a6b62 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index d551475517bf..64bb8a22110c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include -- cgit 1.4.1 From 89e9b9e07a390c50980d10aa37a04631db5a23ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:36 -0400 Subject: writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK cgroup writeback requires support from both bdi and filesystem sides. Add BDI_CAP_CGROUP_WRITEBACK and FS_CGROUP_WRITEBACK to indicate support and enable BDI_CAP_CGROUP_WRITEBACK on block based bdi's by default. Also, define CONFIG_CGROUP_WRITEBACK which is enabled if both MEMCG and BLK_CGROUP are enabled. inode_cgwb_enabled() which determines whether a given inode's both bdi and fs support cgroup writeback is added. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- include/linux/backing-dev.h | 32 +++++++++++++++++++++++++++++++- include/linux/fs.h | 1 + init/Kconfig | 5 +++++ 4 files changed, 38 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c3ba9c3b08d4..c114a61590bc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -621,7 +621,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.capabilities = 0; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bfdaa18ba0a1..6bb31234e6a9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -134,12 +134,15 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. + * + * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 #define BDI_CAP_NO_ACCT_WB 0x00000004 #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 +#define BDI_CAP_CGROUP_WRITEBACK 0x00000020 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -229,4 +232,31 @@ static inline int bdi_sched_wait(void *word) return 0; } -#endif /* _LINUX_BACKING_DEV_H */ +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode + * @inode: inode of interest + * + * cgroup writeback requires support from both the bdi and filesystem. + * Test whether @inode has both. + */ +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + return bdi_cap_account_dirty(bdi) && + (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) && + (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool inode_cgwb_enabled(struct inode *inode) +{ + return false; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +#endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index ce100b87fba3..74e0ae0626a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1897,6 +1897,7 @@ struct file_system_type { #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */ +#define FS_CGROUP_WRITEBACK 32 /* Supports cgroup-aware writeback */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); diff --git a/init/Kconfig b/init/Kconfig index dc24dec60232..d4f763332f9f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1141,6 +1141,11 @@ config DEBUG_BLK_CGROUP Enable some debugging help. Currently it exports additional stat files in a cgroup which can be useful for debugging. +config CGROUP_WRITEBACK + bool + depends on MEMCG && BLK_CGROUP + default y + endif # CGROUPS config CHECKPOINT_RESTORE -- cgit 1.4.1 From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:37 -0400 Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo Cc: kbuild test robot Cc: Dan Carpenter Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +- fs/fs-writeback.c | 8 +- fs/inode.c | 1 + include/linux/backing-dev-defs.h | 59 +++++- include/linux/backing-dev.h | 195 +++++++++++++++++++ include/linux/blk-cgroup.h | 4 + include/linux/fs.h | 4 + include/linux/memcontrol.h | 4 + mm/backing-dev.c | 397 +++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 19 +- mm/page-writeback.c | 11 +- 11 files changed, 698 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 54ec1721cd3e..979cfdbb94e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) @@ -827,7 +830,9 @@ done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif return &blkcg->css; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 34d1cb854fc1..99a2440c5588 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); - spin_lock(&bdi->wb.list_lock); + spin_lock(&wb->list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + spin_unlock(&wb->list_lock); } /* @@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; diff --git a/fs/inode.c b/fs/inode.c index ea37cd17b53f..efc9edacfb9b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9e9eafa5f5aa..a1e9c407a59a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -2,8 +2,11 @@ #define __LINUX_BACKING_DEV_DEFS_H #include +#include +#include #include #include +#include #include #include #include @@ -37,10 +40,43 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * For cgroup writeback, multiple wb's may map to the same blkcg. Those + * wb's can operate mostly independently but should share the congested + * state. To facilitate such sharing, the congested state is tracked using + * the following struct which is created on demand, indexed by blkcg ID on + * its bdi, and refcounted. + */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct backing_dev_info *bdi; /* the associated bdi */ + atomic_t refcnt; /* nr of attached wb's and blkg */ + int blkcg_id; /* ID of the associated blkcg */ + struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ +#endif }; +/* + * Each wb (bdi_writeback) can perform writeback operations, is measured + * and throttled, independently. Without cgroup writeback, each bdi + * (bdi_writeback) is served by its embedded bdi->wb. + * + * On the default hierarchy, blkcg implicitly enables memcg. This allows + * using memcg's page ownership for attributing writeback IOs, and every + * memcg - blkcg combination can be served by its own wb by assigning a + * dedicated wb to each memcg, which enables isolation across different + * cgroups and propagation of IO back pressure down from the IO layer upto + * the tasks which are generating the dirty pages to be written back. + * + * A cgroup wb is indexed on its bdi by the ID of the associated memcg, + * refcounted with the number of inodes attached to it, and pins the memcg + * and the corresponding blkcg. As the corresponding blkcg for a memcg may + * change as blkcg is disabled and enabled higher up in the hierarchy, a wb + * is tested for blkcg after lookup and removed from index on mismatch so + * that a new wb for the combination can be created. + */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -78,6 +114,19 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct percpu_ref refcnt; /* used only for !root wb's */ + struct cgroup_subsys_state *memcg_css; /* the associated memcg */ + struct cgroup_subsys_state *blkcg_css; /* and blkcg */ + struct list_head memcg_node; /* anchored at memcg->cgwb_list */ + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + + union { + struct work_struct release_work; + struct rcu_head rcu; + }; +#endif }; struct backing_dev_info { @@ -92,9 +141,13 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - struct bdi_writeback wb; /* default writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; - + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; /* its congested state */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ + struct rb_root cgwb_congested_tree; /* their congested states */ + atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#endif struct device *dev; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6bb31234e6a9..8ae59df2e3d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int __must_check bdi_init(struct backing_dev_info *bdi); @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void __inode_attach_wb(struct inode *inode, struct page *page); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); + /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest @@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returend wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. + */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return inode->i_wb; +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4dc643f2046e..3033eb173eb4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,6 +53,10 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_leaf_weight; + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif }; struct blkg_stat { diff --git a/include/linux/fs.h b/include/linux/fs.h index 74e0ae0626a8..67a42ec95065 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ #include struct backing_dev_info; +struct bdi_writeback; struct export_operations; struct hd_geometry; struct iovec; @@ -635,6 +636,9 @@ struct inode { struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +#endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 637ef626008e..662a953ea8ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -388,6 +388,10 @@ enum { OVER_LIMIT, }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); +#endif + struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ec7658cb984..4c9386c98ec1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -368,6 +368,401 @@ static void wb_exit(struct bdi_writeback *wb) fprop_local_destroy_percpu(&wb->completions); } +#ifdef CONFIG_CGROUP_WRITEBACK + +#include + +/* + * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, + * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU + * protected. cgwb_release_wait is used to wait for the completion of cgwb + * releases from bdi destruction path. + */ +static DEFINE_SPINLOCK(cgwb_lock); +static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); + +/** + * wb_congested_get_create - get or create a wb_congested + * @bdi: associated bdi + * @blkcg_id: ID of the associated blkcg + * @gfp: allocation mask + * + * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. + * The returned wb_congested has its reference count incremented. Returns + * NULL on failure. + */ +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + struct bdi_writeback_congested *new_congested = NULL, *congested; + struct rb_node **node, *parent; + unsigned long flags; + + if (blkcg_id == 1) + return &bdi->wb_congested; +retry: + spin_lock_irqsave(&cgwb_lock, flags); + + node = &bdi->cgwb_congested_tree.rb_node; + parent = NULL; + + while (*node != NULL) { + parent = *node; + congested = container_of(parent, struct bdi_writeback_congested, + rb_node); + if (congested->blkcg_id < blkcg_id) + node = &parent->rb_left; + else if (congested->blkcg_id > blkcg_id) + node = &parent->rb_right; + else + goto found; + } + + if (new_congested) { + /* !found and storage for new one already allocated, insert */ + congested = new_congested; + new_congested = NULL; + rb_link_node(&congested->rb_node, parent, node); + rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); + atomic_inc(&bdi->usage_cnt); + goto found; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + + /* allocate storage for new one and retry */ + new_congested = kzalloc(sizeof(*new_congested), gfp); + if (!new_congested) + return NULL; + + atomic_set(&new_congested->refcnt, 0); + new_congested->bdi = bdi; + new_congested->blkcg_id = blkcg_id; + goto retry; + +found: + atomic_inc(&congested->refcnt); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(new_congested); + return congested; +} + +/** + * wb_congested_put - put a wb_congested + * @congested: wb_congested to put + * + * Put @congested and destroy it if the refcnt reaches zero. + */ +void wb_congested_put(struct bdi_writeback_congested *congested) +{ + struct backing_dev_info *bdi = congested->bdi; + unsigned long flags; + + if (congested->blkcg_id == 1) + return; + + local_irq_save(flags); + if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + local_irq_restore(flags); + return; + } + + rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(congested); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; + + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + wb_congested_put(wb->congested); + + percpu_ref_exit(&wb->refcnt); + wb_exit(wb); + kfree_rcu(wb, rcu); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + schedule_work(&wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + percpu_ref_kill(&wb->refcnt); +} + +static int cgwb_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct cgroup_subsys_state *blkcg_css; + struct blkcg *blkcg; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg = css_to_blkcg(blkcg_css); + memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + blkcg_cgwb_list = &blkcg->cgwb_list; + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) + return -ENOMEM; + + ret = wb_init(wb, bdi, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); + if (!wb->congested) + goto err_ref_exit; + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); + + /* + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). + */ + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + atomic_inc(&bdi->usage_cnt); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_put_congested; + } + goto out_put; + +err_put_congested: + wb_congested_put(wb->congested); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. The returned wb has its refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. + * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. + */ +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) +{ + struct bdi_writeback *wb; + + might_sleep_if(gfp & __GFP_WAIT); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, + &blkio_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || + !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +static void cgwb_bdi_init(struct backing_dev_info *bdi) +{ + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + bdi->wb_congested.blkcg_id = 1; + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + bdi->cgwb_congested_tree = RB_ROOT; + atomic_set(&bdi->usage_cnt, 1); +} + +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + void **slot; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) + cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + + /* + * All cgwb's and their congested states must be shutdown and + * released before returning. Drain the usage counter to wait for + * all cgwb's and cgwb_congested's ever created on @bdi. + */ + atomic_dec(&bdi->usage_cnt); + wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. + */ +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + LIST_HEAD(to_destroy); + struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @blkcg: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. + */ +void wb_blkcg_offline(struct blkcg *blkcg) +{ + LIST_HEAD(to_destroy); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + cgwb_kill(wb); + blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static void cgwb_bdi_init(struct backing_dev_info *bdi) { } +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +#endif /* CONFIG_CGROUP_WRITEBACK */ + int bdi_init(struct backing_dev_info *bdi) { int err; @@ -386,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->wb_congested.state = 0; bdi->wb.congested = &bdi->wb_congested; + cgwb_bdi_init(bdi); return 0; } EXPORT_SYMBOL(bdi_init); @@ -459,6 +855,7 @@ void bdi_destroy(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c270a03729c..49e5aa6abca6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -348,6 +348,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. * @@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 78ef5511a198..9b95cf80b407 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg) { + struct inode *inode = mapping->host; + trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb; + + inode_attach_wb(inode, page); + wb = inode_to_wb(inode); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_wb_stat(&bdi->wb, WB_RECLAIMABLE); - __inc_wb_stat(&bdi->wb, WB_DIRTIED); + __inc_wb_stat(wb, WB_RECLAIMABLE); + __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); -- cgit 1.4.1 From ce7acfeaf0363c8b75810908448f61af04d38f91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:38 -0400 Subject: writeback, blkcg: associate each blkcg_gq with the corresponding bdi_writeback_congested A blkg (blkcg_gq) can be congested and decongested independently from other blkgs on the same request_queue. Accordingly, for cgroup writeback support, the congestion status at bdi (backing_dev_info) should be split and updated separately from matching blkg's. This patch prepares by adding blkg->wb_congested and associating a blkg with its matching per-blkcg bdi_writeback_congested on creation. v2: Updated to associate bdi_writeback_congested instead of bdi_writeback. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 17 +++++++++++++++-- include/linux/blk-cgroup.h | 6 ++++++ 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 979cfdbb94e0..31610ae0ebff 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -182,6 +182,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -193,22 +194,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_ATOMIC); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -EINVAL; - goto err_put_css; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -245,6 +254,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -391,6 +402,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) if (blkg->parent) blkg_put(blkg->parent); + wb_congested_put(blkg->wb_congested); + blkg_free(blkg); } EXPORT_SYMBOL_GPL(__blkg_release_rcu); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3033eb173eb4..07a32b813ed8 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -99,6 +99,12 @@ struct blkcg_gq { struct hlist_node blkcg_node; struct blkcg *blkcg; + /* + * Each blkg gets congested separately and the congestion state is + * propagated to the matching bdi_writeback_congested. + */ + struct bdi_writeback_congested *wb_congested; + /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; -- cgit 1.4.1 From d40f75a06dd675808eed385d490ba9468200b23f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:42 -0400 Subject: writeback, blkcg: restructure blk_{set|clear}_queue_congested() blk_{set|clear}_queue_congested() take @q and set or clear, respectively, the congestion state of its bdi's root wb. Because bdi used to be able to handle congestion state only on the root wb, the callers of those functions tested whether the congestion is on the root blkcg and skipped if not. This is cumbersome and makes implementation of per cgroup bdi_writeback congestion state propagation difficult. This patch renames blk_{set|clear}_queue_congested() to blk_{set|clear}_congested(), and makes them take request_list instead of request_queue and test whether the specified request_list is the root one before updating bdi_writeback congestion state. This makes the tests in the callers unnecessary and simplifies them. As there are no external users of these functions, the definitions are moved from include/linux/blkdev.h to block/blk-core.c. This patch doesn't introduce any noticeable behavior difference. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 62 ++++++++++++++++++++++++++++++-------------------- include/linux/blkdev.h | 19 ---------------- 2 files changed, 37 insertions(+), 44 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index c114a61590bc..f46a43ef1a94 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -63,6 +63,28 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ + if (rl != &rl->q->root_rl) + return; +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ + if (rl != &rl->q->root_rl) + return; +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -842,13 +864,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -881,25 +898,25 @@ static void freed_request(struct request_list *rl, unsigned int flags) int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; + int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); - blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { @@ -1009,12 +1026,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 60d2726a6b62..ab4a27852f1b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -790,25 +790,6 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern void blk_queue_bio(struct request_queue *q, struct bio *bio); -/* - * A queue has just exitted congestion. Note this in the global counter of - * congested queues, and wake up anyone who was waiting for requests to be - * put back. - */ -static inline void blk_clear_queue_congested(struct request_queue *q, int sync) -{ - clear_bdi_congested(&q->backing_dev_info, sync); -} - -/* - * A queue has just entered congestion. Flag that in the queue's VM-visible - * state flags and increment the global gounter of congested queues. - */ -static inline void blk_set_queue_congested(struct request_queue *q, int sync) -{ - set_bdi_congested(&q->backing_dev_info, sync); -} - extern void blk_start_queue(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -- cgit 1.4.1 From 482cf79cdf6669667a914ffd4cbc57a762b55fef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:43 -0400 Subject: writeback, blkcg: propagate non-root blkcg congestion state Now that bdi layer can handle per-blkcg bdi_writeback_congested state, blk_{set|clear}_congested() can propagate non-root blkcg congestion state to them. This can be easily achieved by disabling the root_rl tests in blk_{set|clear}_congested(). Note that we still need those tests when !CONFIG_CGROUP_WRITEBACK as otherwise we'll end up flipping root blkcg wb's congestion state for events happening on other blkcgs. v2: Updated for bdi_writeback_congested. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f46a43ef1a94..a4a2dbe46fe3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -65,23 +65,26 @@ static struct workqueue_struct *kblockd_workqueue; static void blk_clear_congested(struct request_list *rl, int sync) { - if (rl != &rl->q->root_rl) - return; #ifdef CONFIG_CGROUP_WRITEBACK clear_wb_congested(rl->blkg->wb_congested, sync); #else - clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); #endif } static void blk_set_congested(struct request_list *rl, int sync) { - if (rl != &rl->q->root_rl) - return; #ifdef CONFIG_CGROUP_WRITEBACK set_wb_congested(rl->blkg->wb_congested, sync); #else - set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); #endif } -- cgit 1.4.1