diff options
author | Jiri Kosina <jkosina@suse.cz> | 2016-04-18 11:18:55 +0200 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2016-04-18 11:18:55 +0200 |
commit | 9938b04472d5c59f8bd8152a548533a8599596a2 (patch) | |
tree | 0fc8318100878c5e446076613ec02a97aa179119 /block | |
parent | bd7ced98812dbb906950d8b0ec786f14f631cede (diff) | |
parent | c3b46c73264b03000d1e18b22f5caf63332547c9 (diff) | |
download | linux-9938b04472d5c59f8bd8152a548533a8599596a2.tar.gz |
Merge branch 'master' into for-next
Sync with Linus' tree so that patches against newer codebase can be applied. Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 13 | ||||
-rw-r--r-- | block/Makefile | 4 | ||||
-rw-r--r-- | block/badblocks.c | 585 | ||||
-rw-r--r-- | block/bio-integrity.c | 13 | ||||
-rw-r--r-- | block/bio.c | 101 | ||||
-rw-r--r-- | block/blk-cgroup.c | 15 | ||||
-rw-r--r-- | block/blk-core.c | 196 | ||||
-rw-r--r-- | block/blk-ioc.c | 2 | ||||
-rw-r--r-- | block/blk-iopoll.c | 224 | ||||
-rw-r--r-- | block/blk-map.c | 91 | ||||
-rw-r--r-- | block/blk-merge.c | 77 | ||||
-rw-r--r-- | block/blk-mq-cpumap.c | 2 | ||||
-rw-r--r-- | block/blk-mq-sysfs.c | 25 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 11 | ||||
-rw-r--r-- | block/blk-mq.c | 312 | ||||
-rw-r--r-- | block/blk-mq.h | 13 | ||||
-rw-r--r-- | block/blk-settings.c | 46 | ||||
-rw-r--r-- | block/blk-sysfs.c | 51 | ||||
-rw-r--r-- | block/blk-timeout.c | 25 | ||||
-rw-r--r-- | block/blk.h | 4 | ||||
-rw-r--r-- | block/cfq-iosched.c | 45 | ||||
-rw-r--r-- | block/compat_ioctl.c | 4 | ||||
-rw-r--r-- | block/deadline-iosched.c | 3 | ||||
-rw-r--r-- | block/genhd.c | 32 | ||||
-rw-r--r-- | block/ioctl.c | 37 | ||||
-rw-r--r-- | block/ioprio.c | 6 | ||||
-rw-r--r-- | block/noop-iosched.c | 10 | ||||
-rw-r--r-- | block/partition-generic.c | 48 | ||||
-rw-r--r-- | block/partitions/mac.c | 10 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 6 |
30 files changed, 1402 insertions, 609 deletions
diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..0363cd731320 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_DAX + bool "Block device DAX support" + depends on FS_DAX + depends on BROKEN + help + When DAX support is available (CONFIG_FS_DAX) raw block + devices can also support direct userspace access to the + storage capacity via MMAP(2) similar to a file on a + DAX-enabled filesystem. However, the DAX I/O-path disables + some standard I/O-statistics, and the MMAP(2) path has some + operational differences due to bypassing the page + cache. If in doubt, say N. + config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/Makefile b/block/Makefile index 00ecc97629db..9eda2322b2d4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,10 +5,10 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ - partitions/ + badblocks.o partitions/ obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000000..7be53cb1cc3c --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,585 @@ +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/badblocks.h> +#include <linux/seqlock.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/slab.h> + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error. + */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573c10..711e4d8de6fa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/bio.c b/block/bio.c index ad3f276d74bc..807d25e466ec 100644 --- a/block/bio.c +++ b/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -296,13 +296,19 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio) +static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_endio(parent); + if (!parent->bi_error) + parent->bi_error = bio->bi_error; bio_put(bio); + return parent; +} + +static void bio_chain_endio(struct bio *bio) +{ + bio_endio(__bio_chain_endio(bio)); } /* @@ -395,12 +401,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -459,13 +465,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { @@ -874,7 +880,7 @@ int submit_bio_wait(int rw, struct bio *bio) bio->bi_private = &ret; bio->bi_end_io = submit_bio_wait_endio; submit_bio(rw, bio); - wait_for_completion(&ret.event); + wait_for_completion_io(&ret.event); return ret.error; } @@ -1090,9 +1096,12 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. */ - if (current->mm && bio_data_dir(bio) == READ) + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); @@ -1125,7 +1134,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = iter->count; - unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; for (i = 0; i < iter->nr_segs; i++) { unsigned long uaddr; @@ -1304,7 +1313,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, goto out_unmap; } - offset = uaddr & ~PAGE_MASK; + offset = offset_in_page(uaddr); for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; @@ -1330,7 +1339,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, * release the pages we didn't map into the bio, if any */ while (j < page_limit) - page_cache_release(pages[j++]); + put_page(pages[j++]); } kfree(pages); @@ -1356,7 +1365,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, for (j = 0; j < nr_pages; j++) { if (!pages[j]) break; - page_cache_release(pages[j]); + put_page(pages[j]); } out: kfree(pages); @@ -1376,7 +1385,7 @@ static void __bio_unmap_user(struct bio *bio) if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); - page_cache_release(bvec->bv_page); + put_page(bvec->bv_page); } bio_put(bio); @@ -1606,8 +1615,8 @@ static void bio_release_pages(struct bio *bio) * the BIO and the offending pages and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one page_cache_release() against each page and will - * run one bio_put() against the BIO. + * here on. It will run one put_page() against each page and will run one + * bio_put() against the BIO. */ static void bio_dirty_fn(struct work_struct *work); @@ -1649,7 +1658,7 @@ void bio_check_pages_dirty(struct bio *bio) struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { - page_cache_release(page); + put_page(page); bvec->bv_page = NULL; } else { nr_clean_pages++; @@ -1739,29 +1748,25 @@ static inline bool bio_remaining_done(struct bio *bio) **/ void bio_endio(struct bio *bio) { - while (bio) { - if (unlikely(!bio_remaining_done(bio))) - break; +again: + if (!bio_remaining_done(bio)) + return; - /* - * Need to have a real endio function for chained bios, - * otherwise various corner cases will break (like stacking - * block devices that save/restore bi_end_io) - however, we want - * to avoid unbounded recursion and blowing the stack. Tail call - * optimization would handle this, but compiling with frame - * pointers also disables gcc's sibling call optimization. - */ - if (bio->bi_end_io == bio_chain_endio) { - struct bio *parent = bio->bi_private; - parent->bi_error = bio->bi_error; - bio_put(bio); - bio = parent; - } else { - if (bio->bi_end_io) - bio->bi_end_io(bio); - bio = NULL; - } + /* + * Need to have a real endio function for chained bios, otherwise + * various corner cases will break (like stacking block devices that + * save/restore bi_end_io) - however, we want to avoid unbounded + * recursion and blowing the stack. Tail call optimization would + * handle this, but compiling with frame pointers also disables + * gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + bio = __bio_chain_endio(bio); + goto again; } + + if (bio->bi_end_io) + bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5bcdfc10c23a..66e6f1aae02e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (!disk) return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); + module_put(owner); return -ENODEV; } @@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); @@ -1127,15 +1136,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/block/blk-core.c b/block/blk-core.c index 89eec7965870..b60537b2c35b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,7 +51,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -struct kmem_cache *request_cachep = NULL; +struct kmem_cache *request_cachep; /* * For queue allocation @@ -207,6 +207,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) EXPORT_SYMBOL(blk_delay_queue); /** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + +/** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * @@ -630,7 +646,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); -int blk_queue_enter(struct request_queue *q, gfp_t gfp) +int blk_queue_enter(struct request_queue *q, bool nowait) { while (true) { int ret; @@ -638,7 +654,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp) if (percpu_ref_tryget_live(&q->q_usage_counter)) return 0; - if (!(gfp & __GFP_WAIT)) + if (nowait) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, @@ -664,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -683,7 +706,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) goto fail_id; q->backing_dev_info.ra_pages = - (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; @@ -809,7 +832,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); -static void blk_queue_bio(struct request_queue *q, struct bio *bio); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, @@ -825,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; @@ -1206,8 +1230,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1227,7 +1251,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1276,7 +1300,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, + (gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT); else return blk_old_get_request(q, rw, gfp_mask); } @@ -1305,11 +1331,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ @@ -1575,6 +1601,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, @@ -1678,7 +1707,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1686,8 +1715,6 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; - blk_queue_split(q, &bio, q->bio_split); - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1695,10 +1722,12 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + blk_queue_split(q, &bio, q->bio_split); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio->bi_error = -EIO; bio_endio(bio); - return; + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1713,7 +1742,7 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) */ if (!blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); @@ -1791,6 +1820,8 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } /* @@ -1996,12 +2027,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -2015,7 +2047,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -2038,9 +2070,8 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, __GFP_WAIT) == 0)) { - - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, false) == 0)) { + ret = q->make_request_fn(q, bio); blk_queue_exit(q); @@ -2053,6 +2084,9 @@ void generic_make_request(struct bio *bio) } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -2066,7 +2100,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -2100,12 +2134,13 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2116,20 +2151,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2149,7 +2177,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2161,7 +2188,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && @@ -2171,7 +2198,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, true); + blk_mq_insert_request(rq, false, true, false); return 0; } @@ -2428,14 +2455,16 @@ struct request *blk_peek_request(struct request_queue *q) rq = NULL; break; - } else if (ret == BLKPREP_KILL) { + } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { + int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; + rq->cmd_flags |= REQ_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); - __blk_end_request_all(rq, -EIO); + __blk_end_request_all(rq, err); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; @@ -3306,6 +3335,47 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -3362,6 +3432,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3389,6 +3462,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3413,6 +3489,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3435,6 +3514,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; @@ -3447,6 +3529,30 @@ void blk_post_runtime_resume(struct request_queue *q, int err) spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_post_runtime_resume); + +/** + * blk_set_runtime_active - Force runtime status of the queue to be active + * @q: the queue of the device + * + * If the device is left runtime suspended during system suspend the resume + * hook typically resumes the device and corrects runtime status + * accordingly. However, that does not affect the queue runtime PM status + * which is still "suspended". This prevents processing requests from the + * queue. + * + * This function can be used in driver's resume hook to correct queue + * runtime PM status and re-enable peeking requests from the queue. It + * should be called before first request is added to the queue. + */ +void blk_set_runtime_active(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_set_runtime_active); #endif int __init blk_dev_init(void) @@ -3463,7 +3569,7 @@ int __init blk_dev_init(void) request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("blkdev_queue", + blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); return 0; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 1a27f45ec776..381cb50a673c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -289,7 +289,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c deleted file mode 100644 index 0736729d6494..000000000000 --- a/block/blk-iopoll.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Functions related to interrupt-poll handling in the block layer. This - * is similar to NAPI for network devices. - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/blk-iopoll.h> -#include <linux/delay.h> - -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; - -static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); - -/** - * blk_iopoll_sched - Schedule a run of the iopoll handler - * @iop: The parent iopoll structure - * - * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. - **/ -void blk_iopoll_sched(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_sched); - -/** - * __blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * See blk_iopoll_complete(). This function must be called with interrupts - * disabled. - **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) -{ - list_del(&iop->list); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(__blk_iopoll_complete); - -/** - * blk_iopoll_complete - Mark this @iop as un-polled again - * @iop: The parent iopoll structure - * - * Description: - * If a driver consumes less than the assigned budget in its run of the - * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() - * is called. - **/ -void blk_iopoll_complete(struct blk_iopoll *iop) -{ - unsigned long flags; - - local_irq_save(flags); - __blk_iopoll_complete(iop); - local_irq_restore(flags); -} -EXPORT_SYMBOL(blk_iopoll_complete); - -static void blk_iopoll_softirq(struct softirq_action *h) -{ - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; - unsigned long start_time = jiffies; - - local_irq_disable(); - - while (!list_empty(list)) { - struct blk_iopoll *iop; - int work, weight; - - /* - * If softirq window is exhausted then punt. - */ - if (budget <= 0 || time_after(jiffies, start_time)) { - rearm = 1; - break; - } - - local_irq_enable(); - - /* Even though interrupts have been re-enabled, this - * access is safe because interrupts can only add new - * entries to the tail of this list, and only ->poll() - * calls can remove this head entry from the list. - */ - iop = list_entry(list->next, struct blk_iopoll, list); - - weight = iop->weight; - work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) - work = iop->poll(iop, weight); - - budget -= work; - - local_irq_disable(); - - /* - * Drivers must not modify the iopoll state, if they - * consume their assigned weight (or more, some drivers can't - * easily just stop processing, they have to complete an - * entire mask of commands).In such cases this code - * still "owns" the iopoll instance and therefore can - * move the instance around on the list at-will. - */ - if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); - else - list_move_tail(&iop->list, list); - } - } - - if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - - local_irq_enable(); -} - -/** - * blk_iopoll_disable - Disable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Disable io polling and wait for any pending callbacks to have completed. - **/ -void blk_iopoll_disable(struct blk_iopoll *iop) -{ - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) - msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_disable); - -/** - * blk_iopoll_enable - Enable iopoll on this @iop - * @iop: The parent iopoll structure - * - * Description: - * Enable iopoll on this @iop. Note that the handler run will not be - * scheduled, it will only mark it as active. - **/ -void blk_iopoll_enable(struct blk_iopoll *iop) -{ - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); - smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_enable); - -/** - * blk_iopoll_init - Initialize this @iop - * @iop: The parent iopoll structure - * @weight: The default weight (or command completion budget) - * @poll_fn: The handler to invoke - * - * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). - **/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) -{ - memset(iop, 0, sizeof(*iop)); - INIT_LIST_HEAD(&iop->list); - iop->weight = weight; - iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); -} -EXPORT_SYMBOL(blk_iopoll_init); - -static int blk_iopoll_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), - this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, -}; - -static __init int blk_iopoll_setup(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); - return 0; -} -subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-map.c b/block/blk-map.c index f565e11f465a..a54f0543b956 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if (bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); diff --git a/block/blk-merge.c b/block/blk-merge.c index de5716d8e525..261353166dcf 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,8 @@ #include <linux/blkdev.h> #include <linux/scatterlist.h> +#include <trace/events/block.h> + #include "blk.h" static struct bio *blk_bio_discard_split(struct request_queue *q, @@ -68,6 +70,18 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); } +static inline unsigned get_max_io_size(struct request_queue *q, + struct bio *bio) +{ + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned mask = queue_logical_block_size(q) - 1; + + /* aligned to logical block size */ + sectors &= ~(mask >> 9); + + return sectors; +} + static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, @@ -76,11 +90,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned seg_size = 0, nsegs = 0, sectors = 0; + unsigned front_seg_size = bio->bi_seg_front_size; + bool do_split = true; + struct bio *new = NULL; + const unsigned max_sectors = get_max_io_size(q, bio); bio_for_each_segment(bv, bio, iter) { - if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) - goto split; - /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -88,6 +103,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) goto split; + if (sectors + (bv.bv_len >> 9) > max_sectors) { + /* + * Consider this a new segment if we're splitting in + * the middle of this vector. + */ + if (nsegs < queue_max_segments(q) && + sectors < max_sectors) { + nsegs++; + sectors = max_sectors; + } + if (sectors) + goto split; + /* Make this single bvec as the 1st segment */ + } + if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -98,8 +128,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; - bvprvp = &bv; + bvprvp = &bvprv; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; continue; } new_segment: @@ -108,16 +141,29 @@ new_segment: nsegs++; bvprv = bv; - bvprvp = &bv; + bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; } - *segs = nsegs; - return NULL; + do_split = false; split: *segs = nsegs; - return bio_split(bio, sectors, GFP_NOIO, bs); + + if (do_split) { + new = bio_split(bio, sectors, GFP_NOIO, bs); + if (new) + bio = new; + } + + bio->bi_seg_front_size = front_seg_size; + if (seg_size > bio->bi_seg_back_size) + bio->bi_seg_back_size = seg_size; + + return do_split ? new : NULL; } void blk_queue_split(struct request_queue *q, struct bio **bio, @@ -143,6 +189,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, split->bi_rw |= REQ_NOMERGE; bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); *bio = split; } @@ -257,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -269,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; @@ -412,6 +455,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > rq->nr_phys_segments); + return nsegs; } EXPORT_SYMBOL(blk_rq_map_sg); diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 8764c241e5bb..d0634bcf322f 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -113,7 +113,7 @@ int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) for_each_possible_cpu(i) { if (index == mq_map[i]) - return cpu_to_node(i); + return local_memory_node(cpu_to_node(i)); } return NUMA_NO_NODE; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 6f57a110289c..4ea4dd8a1eed 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) return ret; } +static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -295,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { + .attr = {.name = "io_poll", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_poll_show, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -304,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, + &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -398,19 +408,22 @@ void blk_mq_unregister_disk(struct gendisk *disk) blk_mq_enable_hotplug(); } +void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) +{ + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); +} + static void blk_mq_sysfs_init(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - int i; + int cpu; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) - kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - - queue_for_each_ctx(q, ctx, i) + for_each_possible_cpu(cpu) { + ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + } } int blk_mq_register_disk(struct gendisk *disk) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 60ac684c8b8c..abdbb47405cb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; bs = bt_wait_ptr(bt, hctx); @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data, data->ctx = blk_mq_get_ctx(data->q); data->hctx = data->q->mq_ops->map_queue(data->q, data->ctx->cpu); - if (data->reserved) { + if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { last_tag = &data->ctx->last_tag; @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { - if (!data->reserved) - return __blk_mq_get_tag(data); - - return __blk_mq_get_reserved_tag(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + return __blk_mq_get_reserved_tag(data); + return __blk_mq_get_tag(data); } static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c27b3eaef64..1699baf39b78 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) return NULL; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, - bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, + unsigned int flags) { struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; @@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_queue_enter(q, gfp); + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, - reserved, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; } @@ -358,7 +356,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } -void __blk_mq_complete_request(struct request *rq) +static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; @@ -546,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - return tags->rqs[tag]; + if (tag < tags->nr_tags) + return tags->rqs[tag]; + + return NULL; } EXPORT_SYMBOL(blk_mq_tag_to_rq); @@ -601,12 +602,12 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) - blk_mq_complete_request(rq, -EIO); + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -617,15 +618,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -640,6 +645,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -1175,8 +1181,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, rw |= REQ_SYNC; trace_block_getrq(q, bio, rw); - blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, - hctx); + blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); if (unlikely(!rq)) { __blk_mq_run_hw_queue(hctx); @@ -1185,8 +1190,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1198,7 +1202,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } -static int blk_mq_direct_issue_request(struct request *rq) +static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; @@ -1209,6 +1213,7 @@ static int blk_mq_direct_issue_request(struct request *rq) .list = NULL, .last = 1 }; + blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); /* * For OK queue, we are done. For error, kill it. Any other @@ -1216,18 +1221,21 @@ static int blk_mq_direct_issue_request(struct request *rq) * would have done */ ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) + if (ret == BLK_MQ_RQ_QUEUE_OK) { + *cookie = new_cookie; return 0; - else { - __blk_mq_requeue_request(rq); + } - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - return 0; - } - return -1; + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + *cookie = BLK_QC_T_NONE; + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; } + + return -1; } /* @@ -1235,7 +1243,7 @@ static int blk_mq_direct_issue_request(struct request *rq) * but will attempt to bypass the hctx queueing if we can go straight to * hardware for SYNC IO. */ -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1244,12 +1252,13 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); @@ -1257,13 +1266,15 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) if (!is_flush_fua && !blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) - return; + return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1284,15 +1295,16 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * we do limited pluging. If bio can be merged, do merge. + * We do limited pluging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. So the plug list will have one request at most */ if (plug) { /* * The plug list might get flushed before this. If that - * happens, same_queue_rq is invalid and plug list is empty - **/ + * happens, same_queue_rq is invalid and plug list is + * empty + */ if (same_queue_rq && !list_empty(&plug->mq_list)) { old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); @@ -1302,11 +1314,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) old_rq = rq; blk_mq_put_ctx(data.ctx); if (!old_rq) - return; - if (!blk_mq_direct_issue_request(old_rq)) - return; + goto done; + if (!blk_mq_direct_issue_request(old_rq, &cookie)) + goto done; blk_mq_insert_request(old_rq, false, true, true); - return; + goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1320,13 +1332,15 @@ run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } blk_mq_put_ctx(data.ctx); +done: + return cookie; } /* * Single hardware queue variant. This will attempt to use any per-process * plug for merging and IO deferral. */ -static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); @@ -1334,23 +1348,26 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { bio_io_error(bio); - return; + return BLK_QC_T_NONE; } blk_queue_split(q, &bio, q->bio_split); if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return; + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1368,13 +1385,16 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); if (!request_count) trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { + + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { blk_flush_plug_list(plug, false); trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + return cookie; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1389,6 +1409,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); + return cookie; } /* @@ -1726,31 +1747,6 @@ static int blk_mq_init_hctx(struct request_queue *q, return -1; } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - - /* - * Initialize hardware queues - */ - queue_for_each_hw_ctx(q, hctx, i) { - if (blk_mq_init_hctx(q, set, hctx, i)) - break; - } - - if (i == q->nr_hw_queues) - return 0; - - /* - * Init failed - */ - blk_mq_exit_hw_queues(q, set, i); - - return 1; -} - static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { @@ -1777,7 +1773,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, * not, we remain on the home node of the device */ if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = cpu_to_node(i); + hctx->numa_node = local_memory_node(cpu_to_node(i)); } } @@ -1802,12 +1798,14 @@ static void blk_mq_map_swqueue(struct request_queue *q, /* * Map software to hardware queues */ - queue_for_each_ctx(q, ctx, i) { + for_each_possible_cpu(i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpumask_test_cpu(i, online_mask)) continue; + ctx = per_cpu_ptr(q->queue_ctx, i); hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; @@ -1837,6 +1835,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); + cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1850,14 +1849,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } - - queue_for_each_ctx(q, ctx, i) { - if (!cpumask_test_cpu(i, online_mask)) - continue; - - hctx = q->mq_ops->map_queue(q, i); - cpumask_set_cpu(i, hctx->tags->cpumask); - } } static void queue_set_hctx_shared(struct request_queue *q, bool shared) @@ -1963,56 +1954,93 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx __percpu *ctx; - unsigned int *map; - int i; - - ctx = alloc_percpu(struct blk_mq_ctx); - if (!ctx) - return ERR_PTR(-ENOMEM); - - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - set->numa_node); - - if (!hctxs) - goto err_percpu; - - map = blk_mq_make_queue_map(set); - if (!map) - goto err_map; + int i, j; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + blk_mq_sysfs_unregister(q); for (i = 0; i < set->nr_hw_queues; i++) { - int node = blk_mq_hw_queue_to_node(map, i); + int node; + if (hctxs[i]) + continue; + + node = blk_mq_hw_queue_to_node(q->mq_map, i); hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); if (!hctxs[i]) - goto err_hctxs; + break; if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) - goto err_hctxs; + node)) { + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } atomic_set(&hctxs[i]->nr_active, 0); hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; + + if (blk_mq_init_hctx(q, set, hctxs[i], i)) { + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } + blk_mq_hctx_kobj_init(hctxs[i]); + } + for (j = i; j < q->nr_hw_queues; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + if (hctx->tags) { + blk_mq_free_rq_map(set, hctx->tags, j); + set->tags[j] = NULL; + } + blk_mq_exit_hctx(q, set, hctx, j); + free_cpumask_var(hctx->cpumask); + kobject_put(&hctx->kobj); + kfree(hctx->ctxs); + kfree(hctx); + hctxs[j] = NULL; + + } } + q->nr_hw_queues = i; + blk_mq_sysfs_register(q); +} + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!q->queue_ctx) + return ERR_PTR(-ENOMEM); + + q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), + GFP_KERNEL, set->numa_node); + if (!q->queue_hw_ctx) + goto err_percpu; + + q->mq_map = blk_mq_make_queue_map(set); + if (!q->mq_map) + goto err_map; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + blk_mq_realloc_hw_ctxs(set, q); + if (!q->nr_hw_queues) + goto err_hctxs; + + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = set->nr_hw_queues; - q->mq_map = map; - - q->queue_ctx = ctx; - q->queue_hw_ctx = hctxs; - q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; if (!(set->flags & BLK_MQ_F_SG_MERGE)) @@ -2039,9 +2067,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); - if (blk_mq_init_hw_queues(q, set)) - goto err_hctxs; - get_online_cpus(); mutex_lock(&all_q_mutex); @@ -2055,17 +2080,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(map); - for (i = 0; i < set->nr_hw_queues; i++) { - if (!hctxs[i]) - break; - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - } + kfree(q->mq_map); err_map: - kfree(hctxs); + kfree(q->queue_hw_ctx); err_percpu: - free_percpu(ctx); + free_percpu(q->queue_ctx); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2273,9 +2292,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->nr_hw_queues = 1; set->queue_depth = min(64U, set->queue_depth); } + /* + * There is no use for more h/w queues than cpus. + */ + if (set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; - set->tags = kmalloc_node(set->nr_hw_queues * - sizeof(struct blk_mq_tags *), + set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; @@ -2298,7 +2321,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); } @@ -2319,6 +2342,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) ret = 0; queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; ret = blk_mq_tag_update_depth(hctx->tags, nr); if (ret) break; @@ -2330,6 +2355,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + struct request_queue *q; + + if (nr_hw_queues > nr_cpu_ids) + nr_hw_queues = nr_cpu_ids; + if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) + return; + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue(q); + + set->nr_hw_queues = nr_hw_queues; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_realloc_hw_ctxs(set, q); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_mq_queue_reinit(q, cpu_online_mask); + } + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/block/blk-mq.h b/block/blk-mq.h index b44dce165761..9087b11037b7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -25,7 +25,6 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); @@ -58,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); */ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_rq_timed_out(struct request *req, bool reserved); @@ -97,8 +97,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; - gfp_t gfp; - bool reserved; + unsigned int flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -106,13 +105,11 @@ struct blk_mq_alloc_data { }; static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, - struct request_queue *q, gfp_t gfp, bool reserved, - struct blk_mq_ctx *ctx, - struct blk_mq_hw_ctx *hctx) + struct request_queue *q, unsigned int flags, + struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx) { data->q = q; - data->gfp = gfp; - data->reserved = reserved; + data->flags = flags; data->ctx = ctx; data->hctx = hctx; } diff --git a/block/blk-settings.c b/block/blk-settings.c index 7d8f129a1516..331e4eee0dda 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -92,6 +92,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; @@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -224,36 +226,29 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * the device driver based upon the capabilities of the I/O * controller. * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block/<device>/queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { - if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { - max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + + if ((max_hw_sectors << 9) < PAGE_SIZE) { + max_hw_sectors = 1 << (PAGE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", __func__, max_hw_sectors); } limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); -} -EXPORT_SYMBOL(blk_limits_max_hw_sectors); - -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * See description for blk_limits_max_hw_sectors(). - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -334,8 +329,8 @@ EXPORT_SYMBOL(blk_queue_max_segments); **/ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { - if (max_size < PAGE_CACHE_SIZE) { - max_size = PAGE_CACHE_SIZE; + if (max_size < PAGE_SIZE) { + max_size = PAGE_SIZE; printk(KERN_INFO "%s: set to minimum %d\n", __func__, max_size); } @@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); @@ -764,8 +760,8 @@ EXPORT_SYMBOL_GPL(blk_queue_dma_drain); **/ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { - if (mask < PAGE_CACHE_SIZE - 1) { - mask = PAGE_CACHE_SIZE - 1; + if (mask < PAGE_SIZE - 1) { + mask = PAGE_SIZE - 1; printk(KERN_INFO "%s: set to minimum %lx\n", __func__, mask); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 31849e328b45..995b58d46ed1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -76,7 +76,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { unsigned long ra_kb = q->backing_dev_info.ra_pages << - (PAGE_CACHE_SHIFT - 10); + (PAGE_SHIFT - 10); return queue_var_show(ra_kb, (page)); } @@ -90,7 +90,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); + q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -117,7 +117,7 @@ static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) if (blk_queue_cluster(q)) return queue_var_show(queue_max_segment_size(q), (page)); - return queue_var_show(PAGE_CACHE_SIZE, (page)); + return queue_var_show(PAGE_SIZE, (page)); } static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) @@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) { - unsigned long long val; - val = q->limits.max_hw_discard_sectors << 9; - return sprintf(page, "%llu\n", val); + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); } static ssize_t queue_discard_max_show(struct request_queue *q, char *page) @@ -199,12 +198,15 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { unsigned long max_sectors_kb, max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, - page_kb = 1 << (PAGE_CACHE_SHIFT - 10); + page_kb = 1 << (PAGE_SHIFT - 10); ssize_t ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; @@ -317,6 +319,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + spin_lock_irq(q->queue_lock); + if (poll_on) + queue_flag_set(QUEUE_FLAG_POLL, q); + else + queue_flag_clear(QUEUE_FLAG_POLL, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -442,6 +472,12 @@ static struct queue_sysfs_entry queue_random_entry = { .store = queue_store_random, }; +static struct queue_sysfs_entry queue_poll_entry = { + .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_show, + .store = queue_poll_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -466,6 +502,7 @@ static struct attribute *default_attrs[] = { &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_random_entry.attr, + &queue_poll_entry.attr, NULL, }; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 246dfb16c3d9..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -158,11 +162,13 @@ void blk_abort_request(struct request *req) { if (blk_mark_rq_complete(req)) return; - blk_delete_timer(req); - if (req->q->mq_ops) + + if (req->q->mq_ops) { blk_mq_rq_timed_out(req, false); - else + } else { + blk_delete_timer(req); blk_rq_timed_out(req); + } } EXPORT_SYMBOL_GPL(blk_abort_request); @@ -184,15 +190,13 @@ unsigned long blk_rq_timeout(unsigned long timeout) * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. + * Queue lock must be held for the non-mq case, mq case doesn't care. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; @@ -207,6 +211,11 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->deadline = jiffies + req->timeout; + + /* + * Only the non-mq case needs to add the request to a protected list. + * For the mq case we simply scan the tag map. + */ if (!q->mq_ops) list_add_tail(&req->timeout_list, &req->q->timeout_list); diff --git a/block/blk.h b/block/blk.h index da722eb786df..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -72,8 +72,6 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); -int blk_queue_enter(struct request_queue *q, gfp_t gfp); -void blk_queue_exit(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) @@ -95,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1f9093e901da..4a349787bc62 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) return pblkg ? blkg_to_cfqg(pblkg) : NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, + cfqg_to_blkg(ancestor)->blkcg->css.cgroup); +} + static inline void cfqg_get(struct cfq_group *cfqg) { return blkg_get(cfqg_to_blkg(cfqg)); @@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) #else /* CONFIG_CFQ_GROUP_IOSCHED */ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } +static inline bool cfqg_is_descendant(struct cfq_group *cfqg, + struct cfq_group *ancestor) +{ + return true; +} static inline void cfqg_get(struct cfq_group *cfqg) { } static inline void cfqg_put(struct cfq_group *cfqg) { } @@ -2897,6 +2909,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; + struct cfq_rb_root *st = cfqq->service_tree; struct cfq_io_cq *cic; unsigned long sl, group_idle = 0; @@ -2947,8 +2960,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; } - /* There are other queues in the group, don't do group idle */ - if (group_idle && cfqq->cfqg->nr_cfqq > 1) + /* + * There are other queues in the group or this is the only group and + * it has too big thinktime, don't do group idle. + */ + if (group_idle && + (cfqq->cfqg->nr_cfqq > 1 || + cfq_io_thinktime_big(cfqd, &st->ttime, true))) return; cfq_mark_cfqq_wait_request(cfqq); @@ -3947,16 +3965,27 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; - if (new_cfqq->cfqg != cfqq->cfqg) + /* + * Treat ancestors of current cgroup the same way as current cgroup. + * For anybody else we disallow preemption to guarantee service + * fairness among cgroups. + */ + if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) return false; if (cfq_slice_used(cfqq)) return true; + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) + return true; + + WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); /* Allow preemption only if we are idling on sync-noidle tree */ if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 2 && RB_EMPTY_ROOT(&cfqq->sort_list)) return true; @@ -3967,12 +3996,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) return true; - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return true; - /* An idle queue should not be idle now for some reason */ if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) return true; @@ -4052,7 +4075,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * idle timer unplug to continue working. */ if (cfq_cfqq_wait_request(cfqq)) { - if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || + if (blk_rq_bytes(rq) > PAGE_SIZE || cfqd->busy_queues > 1) { cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f678c733df40..556826ac7cb4 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -710,7 +710,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); return compat_put_long(arg, - (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); + (bdi->ra_pages * PAGE_SIZE) / 512); case BLKROGET: /* compatible */ return compat_put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -729,7 +729,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; + bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index a753df2b3fc2..d0dd7882d8c7 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -39,7 +39,6 @@ struct deadline_data { */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ - sector_t last_sector; /* head position */ unsigned int starved; /* times reads have starved writes */ /* @@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); - dd->last_sector = rq_end_sector(rq); - /* * take it off the sort and fifo list, move * to dispatch queue diff --git a/block/genhd.c b/block/genhd.c index e5cafa51567c..9f42526b4d62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -664,7 +665,6 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); - disk->driverfs_dev = NULL; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -672,6 +672,31 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -990,6 +1015,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1011,6 +1038,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1421,7 +1449,7 @@ static DEFINE_MUTEX(disk_events_mutex); static LIST_HEAD(disk_events); /* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs = 0; +static unsigned long disk_events_dfl_poll_msecs; static unsigned long disk_events_poll_jiffies(struct gendisk *disk) { diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d847..4ff1f92f89ca 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -4,6 +4,7 @@ #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> +#include <linux/badblocks.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> @@ -406,6 +407,35 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } +#ifdef CONFIG_FS_DAX +bool blkdev_dax_capable(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + + if (!disk->fops->direct_access) + return false; + + /* + * If the partition is not aligned on a page boundary, we can't + * do dax I/O to it. + */ + if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) + || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + return false; + + /* + * If the device has known bad blocks, force all I/O through the + * driver / page cache. + * + * TODO: support finer grained dax error handling + */ + if (disk->bb && disk->bb->count) + return false; + + return true; +} +#endif + static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -520,7 +550,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (!arg) return -EINVAL; bdi = blk_get_backing_dev_info(bdev); - return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); + return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -548,7 +578,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; + bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); @@ -568,6 +598,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESETUP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); + case BLKDAXGET: + return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); + break; case IOC_PR_REGISTER: return blkdev_pr_register(bdev, argp); case IOC_PR_RESERVE: diff --git a/block/ioprio.c b/block/ioprio.c index 31666c92b46a..cc7800e9eb44 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3de89d4690f3..a163c487cf38 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { struct noop_data *nd = q->elevator->elevator_data; + struct request *rq; - if (!list_empty(&nd->queue)) { - struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); + if (rq) { list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq) if (rq->queuelist.prev == &nd->queue) return NULL; - return list_entry(rq->queuelist.prev, struct request, queuelist); + return list_prev_entry(rq, queuelist); } static struct request * @@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq) if (rq->queuelist.next == &nd->queue) return NULL; - return list_entry(rq->queuelist.next, struct request, queuelist); + return list_next_entry(rq, queuelist); } static int noop_init_queue(struct request_queue *q, struct elevator_type *e) diff --git a/block/partition-generic.c b/block/partition-generic.c index 3b030157ec85..d7eb77e1e3a8 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,6 +16,7 @@ #include <linux/kmod.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/dax.h> #include <linux/blktrace_api.h> #include "partitions/check.h" @@ -216,10 +217,21 @@ static void part_release(struct device *dev) kfree(p); } +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, + .uevent = part_uevent, }; static void delete_partition_rcu_cb(struct rcu_head *head) @@ -349,15 +361,20 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } + err = hd_ref_init(p); + if (err) { + if (flags & ADDPART_FLAG_WHOLEDISK) + goto out_remove_file; + goto out_del; + } + /* everything is up and running, commence */ rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - - if (!hd_ref_init(p)) - return p; + return p; out_free_info: free_part_info(p); @@ -366,6 +383,8 @@ out_free_stats: out_free: kfree(p); return ERR_PTR(err); +out_remove_file: + device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(p->holder_dir); device_del(pdev); @@ -397,7 +416,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) struct hd_struct *part; int res; - if (bdev->bd_part_count) + if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); if (res) @@ -550,20 +569,31 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) { struct address_space *mapping = bdev->bd_inode->i_mapping; - struct page *page; - page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), + return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); +} + +unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) +{ + struct page *page; + + /* don't populate page cache for dax capable devices */ + if (IS_DAX(bdev->bd_inode)) + page = read_dax_sector(bdev, n); + else + page = read_pagecache_sector(bdev, n); + if (!IS_ERR(page)) { if (PageError(page)) goto fail; p->v = page; - return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); + return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9); fail: - page_cache_release(page); + put_page(page); } p->v = NULL; return NULL; diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c2c48ec64b27..621317ac4d59 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state) Sector sect; unsigned char *data; int slot, blocks_in_map; - unsigned secsize; + unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; @@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_part_sector(state, secsize/512, §); + datasize = round_down(secsize, 512); + data = read_part_sector(state, datasize / 512, §); if (!data) return -1; - part = (struct mac_partition *) (data + secsize%512); + partoffset = secsize % 512; + if (partoffset + sizeof(*part) > datasize) + return -1; + part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index dda653ce7b24..0774799942e0 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; goto error; } @@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); |