summary refs log tree commit diff
path: root/kernel/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-08-30 14:26:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-08-30 14:26:36 -0700
commite5e726f7bb9f711102edea7e5bd511835640e3b4 (patch)
treee9f2d1696cd7a9664a04735568b2adbd2527e2e0 /kernel/sched
parent08403e2174c4ac8b23922b5b7abe670129f8acb5 (diff)
parenta055fcc132d4c25b96d1115aea514258810dc6fc (diff)
downloadlinux-e5e726f7bb9f711102edea7e5bd511835640e3b4.tar.gz
Merge tag 'locking-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking and atomics updates from Thomas Gleixner:
 "The regular pile:

   - A few improvements to the mutex code

   - Documentation updates for atomics to clarify the difference between
     cmpxchg() and try_cmpxchg() and to explain the forward progress
     expectations.

   - Simplification of the atomics fallback generator

   - The addition of arch_atomic_long*() variants and generic arch_*()
     bitops based on them.

   - Add the missing might_sleep() invocations to the down*() operations
     of semaphores.

  The PREEMPT_RT locking core:

   - Scheduler updates to support the state preserving mechanism for
     'sleeping' spin- and rwlocks on RT.

     This mechanism is carefully preserving the state of the task when
     blocking on a 'sleeping' spin- or rwlock and takes regular wake-ups
     targeted at the same task into account. The preserved or updated
     (via a regular wakeup) state is restored when the lock has been
     acquired.

   - Restructuring of the rtmutex code so it can be utilized and
     extended for the RT specific lock variants.

   - Restructuring of the ww_mutex code to allow sharing of the ww_mutex
     specific functionality for rtmutex based ww_mutexes.

   - Header file disentangling to allow substitution of the regular lock
     implementations with the PREEMPT_RT variants without creating an
     unmaintainable #ifdef mess.

   - Shared base code for the PREEMPT_RT specific rw_semaphore and
     rwlock implementations.

     Contrary to the regular rw_semaphores and rwlocks the PREEMPT_RT
     implementation is writer unfair because it is infeasible to do
     priority inheritance on multiple readers. Experience over the years
     has shown that real-time workloads are not the typical workloads
     which are sensitive to writer starvation.

     The alternative solution would be to allow only a single reader
     which has been tried and discarded as it is a major bottleneck
     especially for mmap_sem. Aside of that many of the writer
     starvation critical usage sites have been converted to a writer
     side mutex/spinlock and RCU read side protections in the past
     decade so that the issue is less prominent than it used to be.

   - The actual rtmutex based lock substitutions for PREEMPT_RT enabled
     kernels which affect mutex, ww_mutex, rw_semaphore, spinlock_t and
     rwlock_t. The spin/rw_lock*() functions disable migration across
     the critical section to preserve the existing semantics vs per-CPU
     variables.

   - Rework of the futex REQUEUE_PI mechanism to handle the case of
     early wake-ups which interleave with a re-queue operation to
     prevent the situation that a task would be blocked on both the
     rtmutex associated to the outer futex and the rtmutex based hash
     bucket spinlock.

     While this situation cannot happen on !RT enabled kernels the
     changes make the underlying concurrency problems easier to
     understand in general. As a result the difference between !RT and
     RT kernels is reduced to the handling of waiting for the critical
     section. !RT kernels simply spin-wait as before and RT kernels
     utilize rcu_wait().

   - The substitution of local_lock for PREEMPT_RT with a spinlock which
     protects the critical section while staying preemptible. The CPU
     locality is established by disabling migration.

  The underlying concepts of this code have been in use in PREEMPT_RT for
  way more than a decade. The code has been refactored several times over
  the years and this final incarnation has been optimized once again to be
  as non-intrusive as possible, i.e. the RT specific parts are mostly
  isolated.

  It has been extensively tested in the 5.14-rt patch series and it has
  been verified that !RT kernels are not affected by these changes"

* tag 'locking-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (92 commits)
  locking/rtmutex: Return success on deadlock for ww_mutex waiters
  locking/rtmutex: Prevent spurious EDEADLK return caused by ww_mutexes
  locking/rtmutex: Dequeue waiter on ww_mutex deadlock
  locking/rtmutex: Dont dereference waiter lockless
  locking/semaphore: Add might_sleep() to down_*() family
  locking/ww_mutex: Initialize waiter.ww_ctx properly
  static_call: Update API documentation
  locking/local_lock: Add PREEMPT_RT support
  locking/spinlock/rt: Prepare for RT local_lock
  locking/rtmutex: Add adaptive spinwait mechanism
  locking/rtmutex: Implement equal priority lock stealing
  preempt: Adjust PREEMPT_LOCK_OFFSET for RT
  locking/rtmutex: Prevent lockdep false positive with PI futexes
  futex: Prevent requeue_pi() lock nesting issue on RT
  futex: Simplify handle_early_requeue_pi_wakeup()
  futex: Reorder sanity checks in futex_requeue()
  futex: Clarify comment in futex_requeue()
  futex: Restructure futex_requeue()
  futex: Correct the number of requeued waiters for PI
  futex: Remove bogus condition for requeue PI
  ...
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c109
1 files changed, 92 insertions, 17 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 37bec9b05e31..c4462c454ab9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3776,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 }
 
 /*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ *   The related locking code always holds p::pi_lock when updating
+ *   p::saved_state, which means the code is fully serialized in both cases.
+ *
+ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ *   bits set. This allows to distinguish all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+			     state != TASK_RTLOCK_WAIT);
+	}
+
+	if (READ_ONCE(p->__state) & state) {
+		*success = 1;
+		return true;
+	}
+
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Saved state preserves the task state across blocking on
+	 * an RT lock.  If the state matches, set p::saved_state to
+	 * TASK_RUNNING, but do not wake the task because it waits
+	 * for a lock wakeup. Also indicate success because from
+	 * the regular waker's point of view this has succeeded.
+	 *
+	 * After acquiring the lock the task will restore p::__state
+	 * from p::saved_state which ensures that the regular
+	 * wakeup is not lost. The restore will also set
+	 * p::saved_state to TASK_RUNNING so any further tests will
+	 * not result in false positives vs. @success
+	 */
+	if (p->saved_state & state) {
+		p->saved_state = TASK_RUNNING;
+		*success = 1;
+	}
+#endif
+	return false;
+}
+
+/*
  * Notes on Program-Order guarantees on SMP systems.
  *
  *  MIGRATION
@@ -3914,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		 *  - we're serialized against set_special_state() by virtue of
 		 *    it disabling IRQs (this allows not taking ->pi_lock).
 		 */
-		if (!(READ_ONCE(p->__state) & state))
+		if (!ttwu_state_match(p, state, &success))
 			goto out;
 
-		success = 1;
 		trace_sched_waking(p);
 		WRITE_ONCE(p->__state, TASK_RUNNING);
 		trace_sched_wakeup(p);
@@ -3932,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	smp_mb__after_spinlock();
-	if (!(READ_ONCE(p->__state) & state))
+	if (!ttwu_state_match(p, state, &success))
 		goto unlock;
 
 	trace_sched_waking(p);
 
-	/* We're going to change ->state: */
-	success = 1;
-
 	/*
 	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
 	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
@@ -6061,6 +6106,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 #endif /* CONFIG_SCHED_CORE */
 
 /*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE			0x0
+#define SM_PREEMPT		0x1
+#define SM_RTLOCK_WAIT		0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT	(~0U)
+#else
+# define SM_MASK_PREEMPT	SM_PREEMPT
+#endif
+
+/*
  * __schedule() is the main scheduler function.
  *
  * The main means of driving the scheduler and thus entering this function are:
@@ -6099,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -6112,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev, preempt);
+	schedule_debug(prev, !!sched_mode);
 
 	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 		hrtick_clear(rq);
 
 	local_irq_disable();
-	rcu_note_context_switch(preempt);
+	rcu_note_context_switch(!!sched_mode);
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
@@ -6152,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt)
 	 *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
 	 */
 	prev_state = READ_ONCE(prev->__state);
-	if (!preempt && prev_state) {
+	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
@@ -6218,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(preempt, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
@@ -6239,7 +6302,7 @@ void __noreturn do_task_dead(void)
 	/* Tell freezer to ignore us: */
 	current->flags |= PF_NOFREEZE;
 
-	__schedule(false);
+	__schedule(SM_NONE);
 	BUG();
 
 	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -6300,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void)
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule(false);
+		__schedule(SM_NONE);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 	sched_update_worker(tsk);
@@ -6328,7 +6391,7 @@ void __sched schedule_idle(void)
 	 */
 	WARN_ON_ONCE(current->__state);
 	do {
-		__schedule(false);
+		__schedule(SM_NONE);
 	} while (need_resched());
 }
 
@@ -6363,6 +6426,18 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+	do {
+		preempt_disable();
+		__schedule(SM_RTLOCK_WAIT);
+		sched_preempt_enable_no_resched();
+	} while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
@@ -6381,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void)
 		 */
 		preempt_disable_notrace();
 		preempt_latency_start(1);
-		__schedule(true);
+		__schedule(SM_PREEMPT);
 		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
@@ -6460,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
-		__schedule(true);
+		__schedule(SM_PREEMPT);
 		exception_exit(prev_ctx);
 
 		preempt_latency_stop(1);
@@ -6609,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	do {
 		preempt_disable();
 		local_irq_enable();
-		__schedule(true);
+		__schedule(SM_PREEMPT);
 		local_irq_disable();
 		sched_preempt_enable_no_resched();
 	} while (need_resched());