author     Liu Bo <bo.li.liu@oracle.com>    2015-07-17 16:49:19 +0800
committer  David Sterba <dsterba@suse.com>  2016-02-23 13:10:10 +0100
commit     73beece9ca07c003e0e4f4825b12be167334d4ad (patch)
tree       b0575466fc5e0c7a493826963e7a4ffa49e31210 /fs/btrfs/dev-replace.c
parent     18558cae0272f8fd9647e69d3fec1565a7949865 (diff)
Btrfs: fix lockdep deadlock warning due to dev_replace
Xfstests btrfs/011 complains about a deadlock warning,

[ 1226.649039] =========================================================
[ 1226.649039] [ INFO: possible irq lock inversion dependency detected ]
[ 1226.649039] 4.1.0+ #270 Not tainted
[ 1226.649039] ---------------------------------------------------------
[ 1226.652955] kswapd0/46 just changed the state of lock:
[ 1226.652955]  (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past:
[ 1226.652955]  (&fs_info->dev_replace.lock){+.+.+.}

and interrupts could create inverse lock ordering between them.

[ 1226.652955]
other info that might help us debug this:
[ 1226.652955] Chain exists of:
  &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock

[ 1226.652955]  Possible interrupt unsafe locking scenario:

[ 1226.652955]        CPU0                    CPU1
[ 1226.652955]        ----                    ----
[ 1226.652955]   lock(&fs_info->dev_replace.lock);
[ 1226.652955]                                local_irq_disable();
[ 1226.652955]                                lock(&delayed_node->mutex);
[ 1226.652955]                                lock(&found->groups_sem);
[ 1226.652955]   <Interrupt>
[ 1226.652955]     lock(&delayed_node->mutex);
[ 1226.652955]
 *** DEADLOCK ***

Commit 084b6e7c7607 ("btrfs: Fix a lockdep warning when running xfstest.") tried
to fix a similar issue with exactly the same warning, but even with that change
we still run into this one.

The above lock chain comes from
btrfs_commit_transaction
  ->btrfs_run_delayed_items
    ...
    ->__btrfs_update_delayed_inode
      ...
      ->__btrfs_cow_block
         ...
         ->find_free_extent
            ->cache_block_group
              ->load_free_space_cache
                ->btrfs_readpages
                  ->submit_one_bio
                    ...
                    ->__btrfs_map_block
                      ->btrfs_dev_replace_lock

However, under high memory pressure, a task holding dev_replace.lock can be
interrupted by kswapd, which then tries to release memory occupied by the
superblock, inodes and dentries. In doing so it may call evict_inode(), and
that path comes to

[ 1226.652955]  [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955]  [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30
[ 1226.652955]  [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700

delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), which
leads to an ABBA deadlock.

To fix this, we can use the "blocking rwlock" scheme already used for
extent_buffer, but things are simpler here since we only need to convert the
read side from a spinlock into a blocking lock.
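
For reference, the lock state this implies in struct btrfs_dev_replace looks
roughly like the sketch below. The real definition lives in fs/btrfs/ctree.h,
whose hunk is not part of this file-limited diff, so the exact layout and any
surrounding fields are assumptions inferred from the new helpers:

	/* Sketch only: fields inferred from btrfs_dev_replace_lock() and
	 * friends below; the authoritative definition is in ctree.h. */
	struct btrfs_dev_replace {
		/* ... existing state such as replace_state, srcdev, tgtdev ... */
		rwlock_t lock;			/* replaces the old mutex pair */
		atomic_t read_locks;		/* readers currently holding the lock */
		atomic_t blocking_readers;	/* readers that switched to blocking mode */
		wait_queue_head_t read_lock_wq;	/* writers wait here for blocking readers */
	};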

With this, btrfs/011 no longer produces warnings in dmesg.
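
For context, the reader-side pattern the new API is meant for looks roughly
like the sketch below. The actual call sites (e.g. __btrfs_map_block() in
volumes.c) are outside this file-limited diff, so treat the variable names and
exact sequence as an illustration rather than the patched code:

	/* Sketch of a reader that may block (e.g. submit I/O) while the
	 * replace state must stay stable. */
	btrfs_dev_replace_lock(dev_replace, 0);		/* spinning read lock */
	ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (!ongoing)
		btrfs_dev_replace_unlock(dev_replace, 0);
	else
		/* about to block: drop the spinlock but stay counted */
		btrfs_dev_replace_set_lock_blocking(dev_replace);

	/* ... blocking work such as bio submission ... */

	if (ongoing) {
		btrfs_dev_replace_clear_lock_blocking(dev_replace);
		btrfs_dev_replace_unlock(dev_replace, 0);
	}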

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/dev-replace.c')
-rw-r--r--  fs/btrfs/dev-replace.c  130
1 file changed, 70 insertions, 60 deletions
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cbb7dbfb3fff..8c8b48971bc7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	struct btrfs_dev_replace_item *ptr;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	if (!dev_replace->is_valid ||
 	    !dev_replace->item_needs_writeback) {
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
 		return 0;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 
 	key.objectid = 0;
 	key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	ptr = btrfs_item_ptr(eb, path->slots[0],
 			     struct btrfs_dev_replace_item);
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	if (dev_replace->srcdev)
 		btrfs_set_dev_replace_src_devid(eb, ptr,
 			dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 	btrfs_set_dev_replace_cursor_right(eb, ptr,
 		dev_replace->cursor_right);
 	dev_replace->item_needs_writeback = 0;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	btrfs_mark_buffer_dirty(eb);
 
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 		return PTR_ERR(trans);
 	}
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	dev_replace->is_valid = 1;
 	dev_replace->item_needs_writeback = 1;
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 	if (ret)
@@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		btrfs_dev_replace_lock(dev_replace);
+		btrfs_dev_replace_lock(dev_replace, 1);
 		goto leave;
 	}
 
@@ -433,7 +433,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 leave:
 	dev_replace->srcdev = NULL;
 	dev_replace->tgtdev = NULL;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 	return ret;
 }
@@ -471,18 +471,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	/* don't allow cancel or unmount to disturb the finishing procedure */
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	/* was the operation canceled, or is it finished? */
 	if (dev_replace->replace_state !=
 	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return 0;
 	}
 
 	tgt_device = dev_replace->tgtdev;
 	src_device = dev_replace->srcdev;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 
 	/*
 	 * flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	/* keep away write_all_supers() during the finishing procedure */
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	dev_replace->replace_state =
 		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +528,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			        rcu_str_deref(src_device->name),
 			      src_device->devid,
 			      rcu_str_deref(tgt_device->name), scrub_ret);
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		mutex_unlock(&root->fs_info->chunk_mutex);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 		mutex_unlock(&uuid_mutex);
@@ -565,7 +565,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 	fs_info->fs_devices->rw_devices++;
 
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	btrfs_rm_dev_replace_blocked(fs_info);
 
@@ -649,7 +649,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	struct btrfs_device *srcdev;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	/* even if !dev_replace_is_valid, the values are good enough for
 	 * the replace_status ioctl */
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +675,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 0);
 }
 
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +698,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 		return -EROFS;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		goto leave;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +717,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	btrfs_scrub_cancel(fs_info);
 
 	trans = btrfs_start_transaction(root, 0);
@@ -740,7 +740,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +756,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 		break;
 	}
 
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 }
 
@@ -766,12 +766,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 	struct task_struct *task;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 1);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		return 0;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 		break;
@@ -784,10 +784,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 		btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
 		btrfs_info(fs_info,
 			"you may cancel the operation after 'mount -o degraded'");
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 1);
 		return 0;
 	}
-	btrfs_dev_replace_unlock(dev_replace);
+	btrfs_dev_replace_unlock(dev_replace, 1);
 
 	WARN_ON(atomic_xchg(
 		&fs_info->mutually_exclusive_operation_running, 1));
@@ -865,48 +865,58 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 	return 1;
 }
 
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
 {
-	/* the beginning is just an optimization for the typical case */
-	if (atomic_read(&dev_replace->nesting_level) == 0) {
-acquire_lock:
-		/* this is not a nested case where the same thread
-		 * is trying to acqurire the same lock twice */
-		mutex_lock(&dev_replace->lock);
-		mutex_lock(&dev_replace->lock_management_lock);
-		dev_replace->lock_owner = current->pid;
-		atomic_inc(&dev_replace->nesting_level);
-		mutex_unlock(&dev_replace->lock_management_lock);
-		return;
+	if (rw == 1) {
+		/* write */
+again:
+		wait_event(dev_replace->read_lock_wq,
+			   atomic_read(&dev_replace->blocking_readers) == 0);
+		write_lock(&dev_replace->lock);
+		if (atomic_read(&dev_replace->blocking_readers)) {
+			write_unlock(&dev_replace->lock);
+			goto again;
+		}
+	} else {
+		read_lock(&dev_replace->lock);
+		atomic_inc(&dev_replace->read_locks);
 	}
+}
 
-	mutex_lock(&dev_replace->lock_management_lock);
-	if (atomic_read(&dev_replace->nesting_level) > 0 &&
-	    dev_replace->lock_owner == current->pid) {
-		WARN_ON(!mutex_is_locked(&dev_replace->lock));
-		atomic_inc(&dev_replace->nesting_level);
-		mutex_unlock(&dev_replace->lock_management_lock);
-		return;
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+{
+	if (rw == 1) {
+		/* write */
+		ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+		write_unlock(&dev_replace->lock);
+	} else {
+		ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+		atomic_dec(&dev_replace->read_locks);
+		read_unlock(&dev_replace->lock);
 	}
+}
 
-	mutex_unlock(&dev_replace->lock_management_lock);
-	goto acquire_lock;
+/* inc blocking cnt and release read lock */
+void btrfs_dev_replace_set_lock_blocking(
+					struct btrfs_dev_replace *dev_replace)
+{
+	/* only set blocking for read lock */
+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+	atomic_inc(&dev_replace->blocking_readers);
+	read_unlock(&dev_replace->lock);
 }
 
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+/* acquire read lock and dec blocking cnt */
+void btrfs_dev_replace_clear_lock_blocking(
+					struct btrfs_dev_replace *dev_replace)
 {
-	WARN_ON(!mutex_is_locked(&dev_replace->lock));
-	mutex_lock(&dev_replace->lock_management_lock);
-	WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
-	WARN_ON(dev_replace->lock_owner != current->pid);
-	atomic_dec(&dev_replace->nesting_level);
-	if (atomic_read(&dev_replace->nesting_level) == 0) {
-		dev_replace->lock_owner = 0;
-		mutex_unlock(&dev_replace->lock_management_lock);
-		mutex_unlock(&dev_replace->lock);
-	} else {
-		mutex_unlock(&dev_replace->lock_management_lock);
-	}
+	/* only set blocking for read lock */
+	ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+	ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+	read_lock(&dev_replace->lock);
+	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
+	    waitqueue_active(&dev_replace->read_lock_wq))
+		wake_up(&dev_replace->read_lock_wq);
 }
 
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)