From 7e026c8c0a4200da86bc51edeaad79dcdccf78ca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 May 2017 12:57:56 -0700 Subject: dm: add ->copy_from_iter() dax operation support Allow device-mapper to route copy_from_iter operations to the per-target implementation. In order for the device stacking to work we need a dax_dev and a pgoff relative to that device. This gives each layer of the stack the information it needs to look up the operation pointer for the next level. This conceptually allows for an array of mixed device drivers with varying copy_from_iter implementations. Reviewed-by: Toshi Kani Reviewed-by: Mike Snitzer Signed-off-by: Dan Williams --- drivers/dax/super.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/dax') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 6ed32aac8bbe..dd299e55f65d 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -172,6 +173,18 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, } EXPORT_SYMBOL_GPL(dax_direct_access); +size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + if (!dax_alive(dax_dev)) + return 0; + + if (!dax_dev->ops->copy_from_iter) + return copy_from_iter(addr, bytes, i); + return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); +} +EXPORT_SYMBOL_GPL(dax_copy_from_iter); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); -- cgit 1.4.1 From abebfbe2f7315dd3ec9a0c69596a76e32beb5749 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 29 May 2017 13:02:52 -0700 Subject: dm: add ->flush() dax operation support Allow device-mapper to route flush operations to the per-target implementation. In order for the device stacking to work we need a dax_dev and a pgoff relative to that device. This gives each layer of the stack the information it needs to look up the operation pointer for the next level. This conceptually allows for an array of mixed device drivers with varying flush implementations. Reviewed-by: Toshi Kani Reviewed-by: Mike Snitzer Signed-off-by: Dan Williams --- drivers/dax/super.c | 11 +++++++++++ drivers/md/dm-linear.c | 15 +++++++++++++++ drivers/md/dm-stripe.c | 20 ++++++++++++++++++++ drivers/md/dm.c | 19 +++++++++++++++++++ include/linux/dax.h | 2 ++ include/linux/device-mapper.h | 3 +++ 6 files changed, 70 insertions(+) (limited to 'drivers/dax') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index dd299e55f65d..b7729e4d351a 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -185,6 +185,17 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, } EXPORT_SYMBOL_GPL(dax_copy_from_iter); +void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size) +{ + if (!dax_alive(dax_dev)) + return; + + if (dax_dev->ops->flush) + dax_dev->ops->flush(dax_dev, pgoff, addr, size); +} +EXPORT_SYMBOL_GPL(dax_flush); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 0841ec1bfbad..25e661974319 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -173,6 +173,20 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, + size_t size) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) + return; + dax_flush(dax_dev, pgoff, addr, size); +} + static struct target_type linear_target = { .name = "linear", .version = {1, 3, 0}, @@ -186,6 +200,7 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_copy_from_iter = linear_dax_copy_from_iter, + .dax_flush = linear_dax_flush, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 1ef914f9ca72..8e73517967b6 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -351,6 +351,25 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } +static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, + size_t size) +{ + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + struct stripe_c *sc = ti->private; + struct dax_device *dax_dev; + struct block_device *bdev; + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; + bdev = sc->stripe[stripe].dev->bdev; + + if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) + return; + dax_flush(dax_dev, pgoff, addr, size); +} + /* * Stripe status: * @@ -471,6 +490,7 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_copy_from_iter = stripe_dax_copy_from_iter, + .dax_flush = stripe_dax_flush, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7faaceb52819..09b3efdc8abf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -994,6 +994,24 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (ti->type->dax_flush) + ti->type->dax_flush(ti, pgoff, addr, size); + out: + dm_put_live_table(md, srcu_idx); +} + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH. @@ -2885,6 +2903,7 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .copy_from_iter = dm_dax_copy_from_iter, + .flush = dm_dax_flush, }; /* diff --git a/include/linux/dax.h b/include/linux/dax.h index 407dd3ff6e54..1f6b6072af64 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -82,6 +82,8 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, + size_t size); /* * We use lowest available bit in exceptional entry for locking, one bit for diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 11c8a0a92f9c..67bfe8ddcb32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -134,6 +134,8 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +typedef void (*dm_dax_flush_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, + size_t size); #define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); @@ -184,6 +186,7 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_copy_from_iter_fn dax_copy_from_iter; + dm_dax_flush_fn dax_flush; /* For internal device-mapper use. */ struct list_head list; -- cgit 1.4.1 From 5d61e43b3975c0582003329d9de9d5e85abf5d33 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 27 Jun 2017 13:06:22 -0700 Subject: dax: remove default copy_from_iter fallback Require all dax-drivers to register a ->copy_from_iter() operation so that it is clear which dax_operations are optional and which must be implemented for filesystem-dax to operate. Cc: Gerald Schaefer Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/powerpc/sysdev/axonram.c | 8 ++++++++ drivers/block/brd.c | 8 ++++++++ drivers/dax/super.c | 2 -- drivers/s390/block/dcssblk.c | 8 ++++++++ include/linux/dax.h | 2 +- 5 files changed, 25 insertions(+), 3 deletions(-) (limited to 'drivers/dax') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index a7fe5fee744f..2799706106c6 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -163,8 +164,15 @@ axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pa return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn); } +static size_t axon_ram_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations axon_ram_dax_ops = { .direct_access = axon_ram_dax_direct_access, + .copy_from_iter = axon_ram_copy_from_iter, }; /** diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 57b574f2f66a..f2a7ac350f6a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -22,6 +22,7 @@ #ifdef CONFIG_BLK_DEV_RAM_DAX #include #include +#include #endif #include @@ -354,8 +355,15 @@ static long brd_dax_direct_access(struct dax_device *dax_dev, return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn); } +static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations brd_dax_ops = { .direct_access = brd_dax_direct_access, + .copy_from_iter = brd_dax_copy_from_iter, }; #endif diff --git a/drivers/dax/super.c b/drivers/dax/super.c index b7729e4d351a..9e0160b950d7 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -179,8 +179,6 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, if (!dax_alive(dax_dev)) return 0; - if (!dax_dev->ops->copy_from_iter) - return copy_from_iter(addr, bytes, i); return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); } EXPORT_SYMBOL_GPL(dax_copy_from_iter); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 36e5280af3e4..88fa7b3f7a9d 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -43,8 +44,15 @@ static const struct block_device_operations dcssblk_devops = { .release = dcssblk_release, }; +static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + static const struct dax_operations dcssblk_dax_ops = { .direct_access = dcssblk_dax_direct_access, + .copy_from_iter = dcssblk_dax_copy_from_iter, }; struct dcssblk_dev_info { diff --git a/include/linux/dax.h b/include/linux/dax.h index 1f6b6072af64..73fca1bebaf3 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -16,7 +16,7 @@ struct dax_operations { */ long (*direct_access)(struct dax_device *, pgoff_t, long, void **, pfn_t *); - /* copy_from_iter: dax-driver override for default copy_from_iter */ + /* copy_from_iter: required operation for fs-dax direct-i/o */ size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); /* flush: optional driver-specific cache management after writes */ -- cgit 1.4.1 From 9a60c3ef577beb0376704808949f2c1f8fb0672c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 27 Jun 2017 17:59:28 -0700 Subject: dax: convert to bitmask for flags In preparation for adding more flags, convert the existing flag to a bit-flag. Signed-off-by: Dan Williams --- drivers/dax/super.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/dax') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 9e0160b950d7..8bf71195921b 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -116,13 +116,18 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) EXPORT_SYMBOL_GPL(__bdev_dax_supported); #endif +enum dax_device_flags { + /* !alive + rcu grace period == no new operations / mappings */ + DAXDEV_ALIVE, +}; + /** * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" * @host: optional name for lookups where the device path is not available * @private: dax driver private data - * @alive: !alive + rcu grace period == no new operations / mappings + * @flags: state and boolean properties */ struct dax_device { struct hlist_node list; @@ -130,7 +135,7 @@ struct dax_device { struct cdev cdev; const char *host; void *private; - bool alive; + unsigned long flags; const struct dax_operations *ops; }; @@ -197,7 +202,7 @@ EXPORT_SYMBOL_GPL(dax_flush); bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); - return dax_dev->alive; + return test_bit(DAXDEV_ALIVE, &dax_dev->flags); } EXPORT_SYMBOL_GPL(dax_alive); @@ -217,7 +222,7 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; - dax_dev->alive = false; + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); @@ -257,7 +262,7 @@ static void dax_destroy_inode(struct inode *inode) { struct dax_device *dax_dev = to_dax_dev(inode); - WARN_ONCE(dax_dev->alive, + WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), "kill_dax() must be called before final iput()\n"); call_rcu(&inode->i_rcu, dax_i_callback); } @@ -309,7 +314,7 @@ static struct dax_device *dax_dev_get(dev_t devt) dax_dev = to_dax_dev(inode); if (inode->i_state & I_NEW) { - dax_dev->alive = true; + set_bit(DAXDEV_ALIVE, &dax_dev->flags); inode->i_cdev = &dax_dev->cdev; inode->i_mode = S_IFCHR; inode->i_flags = S_DAX; -- cgit 1.4.1 From 6e0c90d691cd5d90569f5918ab03eb76c81f9c6e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 26 Jun 2017 21:28:41 -0700 Subject: libnvdimm, pmem, dax: export a cache control attribute The dax_flush() operation can be turned into a nop on platforms where firmware arranges for cpu caches to be flushed on a power-fail event. The ACPI 6.2 specification defines a mechanism for the platform to indicate this capability so the kernel can select the proper default. However, for other platforms, the administrator must toggle this setting manually. Given this flush setting is a dax-specific mechanism we advertise it through a 'dax' attribute group hanging off a host device. For example, a 'pmem0' block-device gets a 'dax' sysfs-subdirectory with a 'write_cache' attribute to control response to dax cache flush requests. This is similar to the 'queue/write_cache' attribute that appears under block devices. Cc: Jan Kara Cc: Jeff Moyer Cc: Matthew Wilcox Cc: Ross Zwisler Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/super.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvdimm/pmem.c | 10 +++++++ include/linux/dax.h | 3 ++ 3 files changed, 92 insertions(+) (limited to 'drivers/dax') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 8bf71195921b..4827251782a1 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -119,6 +119,8 @@ EXPORT_SYMBOL_GPL(__bdev_dax_supported); enum dax_device_flags { /* !alive + rcu grace period == no new operations / mappings */ DAXDEV_ALIVE, + /* gate whether dax_flush() calls the low level flush routine */ + DAXDEV_WRITE_CACHE, }; /** @@ -139,6 +141,71 @@ struct dax_device { const struct dax_operations *ops; }; +static ssize_t write_cache_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + ssize_t rc; + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE, + &dax_dev->flags)); + put_dax(dax_dev); + return rc; +} + +static ssize_t write_cache_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool write_cache; + int rc = strtobool(buf, &write_cache); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return -ENXIO; + + if (rc) + len = rc; + else if (write_cache) + set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + else + clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + + put_dax(dax_dev); + return len; +} +static DEVICE_ATTR_RW(write_cache); + +static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); + + WARN_ON_ONCE(!dax_dev); + if (!dax_dev) + return 0; + + if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush) + return 0; + return a->mode; +} + +static struct attribute *dax_attributes[] = { + &dev_attr_write_cache.attr, + NULL, +}; + +struct attribute_group dax_attribute_group = { + .name = "dax", + .attrs = dax_attributes, + .is_visible = dax_visible, +}; +EXPORT_SYMBOL_GPL(dax_attribute_group); + /** * dax_direct_access() - translate a device pgoff to an absolute pfn * @dax_dev: a dax_device instance representing the logical memory range @@ -194,11 +261,23 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, if (!dax_alive(dax_dev)) return; + if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)) + return; + if (dax_dev->ops->flush) dax_dev->ops->flush(dax_dev, pgoff, addr, size); } EXPORT_SYMBOL_GPL(dax_flush); +void dax_write_cache(struct dax_device *dax_dev, bool wc) +{ + if (wc) + set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); + else + clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); +} +EXPORT_SYMBOL_GPL(dax_write_cache); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 06f6c27ec1e9..7339d184070e 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -253,6 +253,11 @@ static const struct dax_operations pmem_dax_ops = { .flush = pmem_dax_flush, }; +static const struct attribute_group *pmem_attribute_groups[] = { + &dax_attribute_group, + NULL, +}; + static void pmem_release_queue(void *q) { blk_cleanup_queue(q); @@ -287,6 +292,7 @@ static int pmem_attach_disk(struct device *dev, struct pmem_device *pmem; struct resource pfn_res; struct request_queue *q; + struct device *gendev; struct gendisk *disk; void *addr; @@ -384,8 +390,12 @@ static int pmem_attach_disk(struct device *dev, put_disk(disk); return -ENOMEM; } + dax_write_cache(dax_dev, true); pmem->dax_dev = dax_dev; + gendev = disk_to_dev(disk); + gendev->groups = pmem_attribute_groups; + device_add_disk(dev, disk); if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; diff --git a/include/linux/dax.h b/include/linux/dax.h index 73fca1bebaf3..8f39db7439c3 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -23,6 +23,8 @@ struct dax_operations { void (*flush)(struct dax_device *, pgoff_t, void *, size_t); }; +extern struct attribute_group dax_attribute_group; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *dax_get_by_host(const char *host); void put_dax(struct dax_device *dax_dev); @@ -84,6 +86,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t size); +void dax_write_cache(struct dax_device *dax_dev, bool wc); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit 1.4.1