From 555f66d0f8a38537456acc77043d0e4469fcbe8e Mon Sep 17 00:00:00 2001
From: Daniel Wagner
Date: Tue, 14 Sep 2021 11:20:06 +0200
Subject: nvme-fc: update hardware queues before using them

In case the number of hardware queues changes, we need to update the
tagset and the mapping of ctx to hctx first.

If we try to create and connect the I/O queues first, this operation
will fail (the target will reject the connect call due to the wrong
number of queues) and hence we bail out of the recreate function. Then
we will try the very same operation again, thus we don't make any
progress.

Signed-off-by: Daniel Wagner
Reviewed-by: Ming Lei
Reviewed-by: Himanshu Madhani
Reviewed-by: Hannes Reinecke
Reviewed-by: James Smart
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/fc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b08a61ca283f..b5d9a5507de5 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2951,14 +2951,6 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
 	if (ctrl->ctrl.queue_count == 1)
 		return 0;
 
-	ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
-	if (ret)
-		goto out_free_io_queues;
-
-	ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
-	if (ret)
-		goto out_delete_hw_queues;
-
 	if (prior_ioq_cnt != nr_io_queues) {
 		dev_info(ctrl->ctrl.device,
 			"reconnect: revising io queue count from %d to %d\n",
@@ -2968,6 +2960,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
 		nvme_unfreeze(&ctrl->ctrl);
 	}
 
+	ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
+	if (ret)
+		goto out_free_io_queues;
+
+	ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
+	if (ret)
+		goto out_delete_hw_queues;
+
 	return 0;
 
 out_delete_hw_queues:
--
cgit 1.4.1


From e5445dae29d25d7b03e0a10d3d4277a1d0c8119b Mon Sep 17 00:00:00 2001
From: James Smart
Date: Tue, 14 Sep 2021 11:20:07 +0200
Subject: nvme-fc: avoid race between time out and tear down

To avoid a race between timeout and teardown, the teardown process
first quiesces the queue, then deletes the timer and cancels the
timeout work for the queue.

This patch merges the admin and I/O sync ops into the queue teardown
logic, as done in the RDMA patch 3017013dcc "nvme-rdma: avoid race
between time out and tear down". There is no teardown_lock in nvme-fc.

Signed-off-by: James Smart
Tested-by: Daniel Wagner
Reviewed-by: Himanshu Madhani
Reviewed-by: Hannes Reinecke
Reviewed-by: Daniel Wagner
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/fc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b5d9a5507de5..6ebe68396712 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2487,6 +2487,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
 	 */
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
+		nvme_sync_io_queues(&ctrl->ctrl);
 		blk_mq_tagset_busy_iter(&ctrl->tag_set,
 				nvme_fc_terminate_exchange, &ctrl->ctrl);
 		blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
@@ -2510,6 +2511,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
 	 * clean up the admin queue. Same thing as above.
 	 */
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	blk_sync_queue(ctrl->ctrl.admin_q);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_fc_terminate_exchange, &ctrl->ctrl);
 	blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
--
cgit 1.4.1


From bdaa1365667103e7a754e87c08b846a979ce322b Mon Sep 17 00:00:00 2001
From: James Smart
Date: Tue, 14 Sep 2021 11:20:08 +0200
Subject: nvme-fc: remove freeze/unfreeze around update_nr_hw_queues

Remove the freeze/unfreeze around changes to the number of hardware
queues. Study and retest have indicated that no I/Os can be active at
this point, so there is nothing to freeze.

nvme-fc is draining the queues in the shutdown and error recovery path
in __nvme_fc_abort_outstanding_ios.

This patch primarily reverts 88e837ed0f1f "nvme-fc: wait for queues to
freeze before calling update_hr_hw_queues". It's not an exact revert,
as it keeps the adjustment of hw queues, now done only if the count
changes.

Signed-off-by: James Smart
[dwagner: added explanation why no IO is pending]
Signed-off-by: Daniel Wagner
Reviewed-by: Ming Lei
Reviewed-by: Himanshu Madhani
Reviewed-by: Hannes Reinecke
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/fc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6ebe68396712..aa14ad963d91 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2957,9 +2957,7 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
 		dev_info(ctrl->ctrl.device,
 			"reconnect: revising io queue count from %d to %d\n",
 			prior_ioq_cnt, nr_io_queues);
-		nvme_wait_freeze(&ctrl->ctrl);
 		blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
-		nvme_unfreeze(&ctrl->ctrl);
 	}
 
 	ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
--
cgit 1.4.1


From e371af033c560b9dd1e861f8f0b503142bf0a06c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 14 Sep 2021 18:38:55 +0300
Subject: nvme-tcp: fix incorrect h2cdata pdu offset accounting

When the controller sends us multiple r2t PDUs for a single request, we
need to account for them correctly, as our send/recv contexts run
concurrently (i.e. we may get a new r2t with r2t_offset before we have
updated our iterator and req->data_sent marker). This can cause wrong
offsets to be sent to the controller.

To fix that, observe that this can happen only in the send sequence of
the last page. Hence, take the h2c PDU data_offset directly from
r2t_offset, and in the nvme_tcp_try_send_data loop, make sure to
advance the request markers also when a PDU has been completed but more
r2t PDUs are expected, i.e. when the entire data of the request has not
been sent yet.
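
As a hypothetical illustration (offsets invented for this example, not
taken from a trace), consider an 8K write for which the controller
issues two 4K r2t PDUs:

    send context                    recv context
    ------------                    ------------
    send h2c data for r2t #1
    (last page; iterator and
    req->data_sent not yet updated)
                                    receive r2t #2 (r2t_offset = 4096)
                                    set up h2c PDU with data_offset taken
                                    from req->data_sent, which still
                                    reads 0 -> wrong offset on the wire

Taking data_offset from pdu->r2t_offset makes the h2c PDU setup
independent of when the send path advances the request iterator.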
Fixes: 825619b09ad3 ("nvme-tcp: fix possible use-after-completion")
Reported-by: Nowak, Lukasz
Tested-by: Nowak, Lukasz
Signed-off-by: Sagi Grimberg
Reviewed-by: Keith Busch
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/tcp.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index e4249b7dc056..3c1c29dd3020 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -620,7 +620,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 	data->ttag = pdu->ttag;
 	data->command_id = nvme_cid(rq);
-	data->data_offset = cpu_to_le32(req->data_sent);
+	data->data_offset = pdu->r2t_offset;
 	data->data_length = cpu_to_le32(req->pdu_len);
 	return 0;
 }
@@ -953,7 +953,15 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 			nvme_tcp_ddgst_update(queue->snd_hash, page,
 					offset, ret);
 
-		/* fully successful last write*/
+		/*
+		 * update the request iterator except for the last payload send
+		 * in the request where we don't want to modify it as we may
+		 * compete with the RX path completing the request.
+		 */
+		if (req->data_sent + ret < req->data_len)
+			nvme_tcp_advance_req(req, ret);
+
+		/* fully successful last send in current PDU */
 		if (last && ret == len) {
 			if (queue->data_digest) {
 				nvme_tcp_ddgst_final(queue->snd_hash,
@@ -965,7 +973,6 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 			}
 			return 1;
 		}
-		nvme_tcp_advance_req(req, ret);
 	}
 	return -EAGAIN;
 }
--
cgit 1.4.1


From 298ba0e3d4af539cc37f982d4c011a0f07fca48c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 14 Sep 2021 08:38:20 +0200
Subject: nvme: keep ctrl->namespaces ordered

Various places in the nvme code rely on ctrl->namespaces to be ordered.
Ensure that the namespace is inserted into the list at the right
position from the start instead of sorting it after the fact.

Fixes: 540c801c65eb ("NVMe: Implement namespace list scanning")
Reported-by: Anton Eidelman
Signed-off-by: Christoph Hellwig
Reviewed-by: Keith Busch
Reviewed-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Damien Le Moal
---
 drivers/nvme/host/core.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6600e138945e..e486845d2c7e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -13,7 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
-#include <linux/list_sort.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/pr.h>
@@ -3716,15 +3715,6 @@ out_unlock:
 	return ret;
 }
 
-static int ns_cmp(void *priv, const struct list_head *a,
-		const struct list_head *b)
-{
-	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
-	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
-
-	return nsa->head->ns_id - nsb->head->ns_id;
-}
-
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
@@ -3745,6 +3735,22 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
 
+/*
+ * Add the namespace to the controller list while keeping the list ordered.
+ */
+static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
+{
+	struct nvme_ns *tmp;
+
+	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
+		if (tmp->head->ns_id < ns->head->ns_id) {
+			list_add(&ns->list, &tmp->list);
+			return;
+		}
+	}
+	list_add(&ns->list, &ns->ctrl->namespaces);
+}
+
 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 		struct nvme_ns_ids *ids)
 {
@@ -3795,9 +3801,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 		goto out_unlink_ns;
 
 	down_write(&ctrl->namespaces_rwsem);
-	list_add_tail(&ns->list, &ctrl->namespaces);
+	nvme_ns_add_to_ctrl_list(ns);
 	up_write(&ctrl->namespaces_rwsem);
-
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
@@ -4080,10 +4085,6 @@ static void nvme_scan_work(struct work_struct *work)
 	if (nvme_scan_ns_list(ctrl) != 0)
 		nvme_scan_ns_sequential(ctrl);
 	mutex_unlock(&ctrl->scan_lock);
-
-	down_write(&ctrl->namespaces_rwsem);
-	list_sort(NULL, &ctrl->namespaces, ns_cmp);
-	up_write(&ctrl->namespaces_rwsem);
 }
 
 /*
--
cgit 1.4.1


From 7df835a32a8bedf7ce88efcfa7c9b245b52ff139 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 1 Sep 2021 13:38:29 +0200
Subject: md: fix a lock order reversal in md_alloc

Commit b0140891a8cea3 ("md: Fix race when creating a new md device.")
not only moved assigning mddev->gendisk before calling add_disk, which
fixes the races described in the commit log, but also added a
mddev->open_mutex critical section over add_disk and the creation of
the md kobj. Adding a kobject after add_disk is racy vs deleting the
gendisk right after adding it, but md already protects against that by
holding a mddev->active reference.

On the other hand, taking this lock added a lock order reversal with
what is now disk->open_mutex (used to be bdev->bd_mutex when the commit
was added) for partition devices, which need that lock for the internal
open for the partition scan, and a recent commit also takes it for
non-partitioned devices, leading to further lockdep splatter.

Fixes: b0140891a8ce ("md: Fix race when creating a new md device.")
Fixes: d62633873590 ("block: support delayed holder registration")
Reported-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com
Signed-off-by: Christoph Hellwig
Tested-by: syzbot+fadc0aaf497e6a493b9f@syzkaller.appspotmail.com
Reviewed-by: NeilBrown
Signed-off-by: Song Liu
---
 drivers/md/md.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ae8fe54ea358..6c0c3d0d905a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5700,10 +5700,6 @@ static int md_alloc(dev_t dev, char *name)
 	disk->flags |= GENHD_FL_EXT_DEVT;
 	disk->events |= DISK_EVENT_MEDIA_CHANGE;
 	mddev->gendisk = disk;
-	/* As soon as we call add_disk(), another thread could get
-	 * through to md_open, so make sure it doesn't get too far
-	 */
-	mutex_lock(&mddev->open_mutex);
 	add_disk(disk);
 
 	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
@@ -5718,7 +5714,6 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		pr_debug("pointless warning\n");
-	mutex_unlock(&mddev->open_mutex);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
--
cgit 1.4.1
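
The lock order reversal described above reduces to a classic AB-BA
pattern. Below is a minimal userspace sketch of that pattern, with
pthreads standing in for the kernel mutexes; all names are illustrative,
not kernel API:

/*
 * AB-BA sketch of the inversion fixed above (illustrative only, not
 * kernel code). Before the fix, md_alloc() took mddev->open_mutex and
 * then, via add_disk()'s partition scan, disk->open_mutex; the block
 * device open path takes disk->open_mutex first and then
 * mddev->open_mutex in md_open().
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mddev_open_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t disk_open_mutex = PTHREAD_MUTEX_INITIALIZER;

static void *md_alloc_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&mddev_open_mutex);	/* A */
	pthread_mutex_lock(&disk_open_mutex);	/* then B, via add_disk() */
	pthread_mutex_unlock(&disk_open_mutex);
	pthread_mutex_unlock(&mddev_open_mutex);
	return NULL;
}

static void *md_open_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&disk_open_mutex);	/* B, held by the open path */
	pthread_mutex_lock(&mddev_open_mutex);	/* then A: AB-BA, may deadlock */
	pthread_mutex_unlock(&mddev_open_mutex);
	pthread_mutex_unlock(&disk_open_mutex);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, md_alloc_path, NULL);
	pthread_create(&t2, NULL, md_open_path, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("finished; lock order is still inverted between the paths\n");
	return 0;
}

Dropping the mddev->open_mutex critical section from md_alloc(), as the
patch above does, removes the A-then-B ordering entirely; the kobject
race it originally guarded against is already covered by the
mddev->active reference.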