diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 4 | ||||
-rw-r--r-- | kernel/crash_core.c | 1 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/hung_task.c | 11 | ||||
-rw-r--r-- | kernel/iomem.c | 167 | ||||
-rw-r--r-- | kernel/irq/manage.c | 38 | ||||
-rw-r--r-- | kernel/irq/migration.c | 31 | ||||
-rw-r--r-- | kernel/memremap.c | 210 | ||||
-rw-r--r-- | kernel/resource.c | 1 | ||||
-rw-r--r-- | kernel/rseq.c | 357 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/signal.c | 24 | ||||
-rw-r--r-- | kernel/sys.c | 10 | ||||
-rw-r--r-- | kernel/sys_ni.c | 3 | ||||
-rw-r--r-- | kernel/workqueue.c | 1 |
15 files changed, 646 insertions, 217 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474..d2001624fe7a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,7 +112,9 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d676889..b66aced5e8c2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/kernel/fork.c b/kernel/fork.c index 80b48a8fb47b..08c6e5e217a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); @@ -1899,6 +1900,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593ed7c0b..32b479468e4d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/iomem.c b/kernel/iomem.c new file mode 100644 index 000000000000..f7525e14ebc6 --- /dev/null +++ b/kernel/iomem.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + /* In the simple case just return the existing linear address */ + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) + return __va(offset); + + return NULL; /* fallback to arch_memremap_wb */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. + * + * MEMREMAP_WB - matches the default mapping for System RAM on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + void *addr = NULL; + + if (!flags) + return NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in System RAM. + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size, flags); + if (!addr) + addr = arch_memremap_wb(offset, size); + } + + /* + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * System RAM. + */ + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) + addr = ioremap_wt(offset, size); + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e3336d904f64..daeabd791d58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -24,6 +24,7 @@ #ifdef CONFIG_IRQ_FORCED_THREADING __read_mostly bool force_irqthreads; +EXPORT_SYMBOL_GPL(force_irqthreads); static int __init setup_forced_irqthreads(char *arg) { @@ -204,6 +205,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + irqd_set_move_pending(data); + irq_copy_pending(desc, dest); + return 0; +} +#else +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + return -EBUSY; +} +#endif + +static int irq_try_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + int ret = irq_do_set_affinity(data, dest, force); + + /* + * In case that the underlying vector management is busy and the + * architecture supports the generic pending mechanism then utilize + * this to avoid returning an error to user space. + */ + if (ret == -EBUSY && !force) + ret = irq_set_affinity_pending(data, dest); + return ret; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -214,8 +248,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, force); + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { + ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 86ae0eb80b53..def48589ea48 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_data *data = &desc->irq_data; + struct irq_chip *chip = data->chip; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(data))) return; - irqd_clr_move_pending(&desc->irq_data); + irqd_clr_move_pending(data); /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (irqd_is_per_cpu(&desc->irq_data)) { + if (irqd_is_per_cpu(data)) { WARN_ON(1); return; } @@ -73,13 +74,24 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) - irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); - + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + int ret; + + ret = irq_do_set_affinity(data, desc->pending_mask, false); + /* + * If the there is a cleanup pending in the underlying + * vector management, reschedule the move for the next + * interrupt. Leave desc->pending_mask intact. + */ + if (ret == -EBUSY) { + irqd_set_move_pending(data); + return; + } + } cpumask_clear(desc->pending_mask); } -void irq_move_irq(struct irq_data *idata) +void __irq_move_irq(struct irq_data *idata) { bool masked; @@ -90,9 +102,6 @@ void irq_move_irq(struct irq_data *idata) */ idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); - if (likely(!irqd_is_setaffinity_pending(idata))) - return; - if (unlikely(irqd_irq_disabled(idata))) return; diff --git a/kernel/memremap.c b/kernel/memremap.c index 895e6b76b25e..5857267a4af5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include <linux/radix-tree.h> #include <linux/device.h> #include <linux/types.h> @@ -19,170 +9,8 @@ #include <linux/memory_hotplug.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/wait_bit.h> -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - -#ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) -{ - return (__force void *)ioremap_cache(offset, size); -} -#endif - -#ifndef arch_memremap_can_ram_remap -static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - return true; -} -#endif - -static void *try_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - unsigned long pfn = PHYS_PFN(offset); - - /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && - arch_memremap_can_ram_remap(offset, size, flags)) - return __va(offset); - - return NULL; /* fallback to arch_memremap_wb */ -} - -/** - * memremap() - remap an iomem_resource as cacheable memory - * @offset: iomem resource start address - * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, - * MEMREMAP_ENC, MEMREMAP_DEC - * - * memremap() is "ioremap" for cases where it is known that the resource - * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. In the case of multiple flags, the different - * mapping types will be attempted in the order listed below until one of - * them succeeds. - * - * MEMREMAP_WB - matches the default mapping for System RAM on - * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM - * memremap() will bypass establishing a new mapping and instead return - * a pointer into the direct map. - * - * MEMREMAP_WT - establish a mapping whereby writes either bypass the - * cache or are written through to memory and never exist in a - * cache-dirty state with respect to program visibility. Attempts to - * map System RAM with this mapping type will fail. - * - * MEMREMAP_WC - establish a writecombine mapping, whereby writes may - * be coalesced together (e.g. in the CPU's write buffers), but is otherwise - * uncached. Attempts to map System RAM with this mapping type will fail. - */ -void *memremap(resource_size_t offset, size_t size, unsigned long flags) -{ - int is_ram = region_intersects(offset, size, - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - void *addr = NULL; - - if (!flags) - return NULL; - - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - /* Try all mapping types requested until one returns non-NULL */ - if (flags & MEMREMAP_WB) { - /* - * MEMREMAP_WB is special in that it can be satisifed - * from the direct map. Some archs depend on the - * capability of memremap() to autodetect cases where - * the requested range is potentially in System RAM. - */ - if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size, flags); - if (!addr) - addr = arch_memremap_wb(offset, size); - } - - /* - * If we don't have a mapping yet and other request flags are - * present then we will be attempting to establish a new virtual - * address mapping. Enforce that this mapping is not aliasing - * System RAM. - */ - if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { - WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - if (!addr && (flags & MEMREMAP_WT)) - addr = ioremap_wt(offset, size); - - if (!addr && (flags & MEMREMAP_WC)) - addr = ioremap_wc(offset, size); - - return addr; -} -EXPORT_SYMBOL(memremap); - -void memunmap(void *addr) -{ - if (is_vmalloc_addr(addr)) - iounmap((void __iomem *) addr); -} -EXPORT_SYMBOL(memunmap); - -static void devm_memremap_release(struct device *dev, void *res) -{ - memunmap(*(void **)res); -} - -static int devm_memremap_match(struct device *dev, void *res, void *match_data) -{ - return *(void **)res == match_data; -} - -void *devm_memremap(struct device *dev, resource_size_t offset, - size_t size, unsigned long flags) -{ - void **ptr, *addr; - - ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, - dev_to_node(dev)); - if (!ptr) - return ERR_PTR(-ENOMEM); - - addr = memremap(offset, size, flags); - if (addr) { - *ptr = addr; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENXIO); - } - - return addr; -} -EXPORT_SYMBOL(devm_memremap); - -void devm_memunmap(struct device *dev, void *addr) -{ - WARN_ON(devres_release(dev, devm_memremap_release, - devm_memremap_match, addr)); -} -EXPORT_SYMBOL(devm_memunmap); - -#ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return pgmap; } -#endif /* CONFIG_ZONE_DEVICE */ +EXPORT_SYMBOL_GPL(get_dev_pagemap); + +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL_GPL(devmap_managed_key); +static atomic_t devmap_enable; + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. + */ +void dev_pagemap_get_ops(void) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); + +void dev_pagemap_put_ops(void) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page) +void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_or_public_page); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/resource.c b/kernel/resource.c index b589dda910b3..30e1bc68503b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -415,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return __walk_iomem_res_desc(&res, desc, false, arg, func); } +EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..ae306f90c514 --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Restartable sequences system call + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> + * Copyright (C) 2015-2018, EfficiOS Inc., + * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + */ + +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/rseq.h> +#include <linux/types.h> +#include <asm/ptrace.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> + +#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) + +/* + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * It allows user-space to perform update operations on per-cpu data + * without requiring heavy-weight atomic operations. + * + * Detailed algorithm of rseq user-space assembly sequences: + * + * init(rseq_cs) + * cpu = TLS->rseq::cpu_id_start + * [1] TLS->rseq::rseq_cs = rseq_cs + * [start_ip] ---------------------------- + * [2] if (cpu != TLS->rseq::cpu_id) + * goto abort_ip; + * [3] <last_instruction_in_cs> + * [post_commit_ip] ---------------------------- + * + * The address of jump target abort_ip must be outside the critical + * region, i.e.: + * + * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] + * + * Steps [2]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being interrupted between any of those + * instructions, and then resumed to the abort_ip. + * + * 1. Userspace stores the address of the struct rseq_cs assembly + * block descriptor into the rseq_cs field of the registered + * struct rseq TLS area. This update is performed through a single + * store within the inline assembly instruction sequence. + * [start_ip] + * + * 2. Userspace tests to check whether the current cpu_id field match + * the cpu number loaded before start_ip, branching to abort_ip + * in case of a mismatch. + * + * If the sequence is preempted or interrupted by a signal + * at or after start_ip and before post_commit_ip, then the kernel + * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return + * ip to abort_ip before returning to user-space, so the preempted + * execution resumes at abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. <success> + * + * On failure at [2], or if interrupted by preempt or signal delivery + * between [1] and [3]: + * + * [abort_ip] + * F1. <failure> + */ + +static int rseq_update_cpu_id(struct task_struct *t) +{ + u32 cpu_id = raw_smp_processor_id(); + + if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + return -EFAULT; + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + trace_rseq_update(t); + return 0; +} + +static int rseq_reset_rseq_cpu_id(struct task_struct *t) +{ + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + + /* + * Reset cpu_id_start to its initial state (0). + */ + if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + return -EFAULT; + /* + * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming + * in after unregistration can figure out that rseq needs to be + * registered again. + */ + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq_cs __user *urseq_cs; + unsigned long ptr; + u32 __user *usig; + u32 sig; + int ret; + + ret = __get_user(ptr, &t->rseq->rseq_cs); + if (ret) + return ret; + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + urseq_cs = (struct rseq_cs __user *)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + if (rseq_cs->version > 0) + return -EINVAL; + + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq_sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq_sig, current->pid, usig); + return -EPERM; + } + return 0; +} + +static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +{ + u32 flags, event_mask; + int ret; + + /* Get thread flags. */ + ret = __get_user(flags, &t->rseq->flags); + if (ret) + return ret; + + /* Take critical section flags into account. */ + flags |= cs_flags; + + /* + * Restart on signal can only be inhibited when restart on + * preempt and restart on migrate are inhibited too. Otherwise, + * a preempted signal handler could fail to restart the prior + * execution context on sigreturn. + */ + if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && + (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != + RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) + return -EINVAL; + + /* + * Load and clear event mask atomically with respect to + * scheduler preemption. + */ + preempt_disable(); + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + preempt_enable(); + + return !!(event_mask & ~flags); +} + +static int clear_rseq_cs(struct task_struct *t) +{ + /* + * The rseq_cs field is set to NULL on preemption or signal + * delivery on top of rseq assembly block, as well as on top + * of code outside of the rseq assembly block. This performs + * a lazy clear of the rseq_cs field. + * + * Set rseq_cs to NULL with single-copy atomicity. + */ + return __put_user(0UL, &t->rseq->rseq_cs); +} + +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + int ret; + + ret = rseq_get_rseq_cs(t, &rseq_cs); + if (ret) + return ret; + + /* + * Handle potentially not being within a critical section. + * If not nested over a rseq critical section, restart is useless. + * Clear the rseq_cs pointer and return. + */ + if (!in_rseq_cs(ip, &rseq_cs)) + return clear_rseq_cs(t); + ret = rseq_need_restart(t, rseq_cs.flags); + if (ret <= 0) + return ret; + ret = clear_rseq_cs(t); + if (ret) + return ret; + trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, + rseq_cs.abort_ip); + instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); + return 0; +} + +/* + * This resume handler must always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + * + * This is how we can ensure that the entire rseq critical section, + * consisting of both the C part and the assembly instruction sequence, + * will issue the commit instruction only if executed atomically with + * respect to other threads scheduled on the same CPU, and with respect + * to signal handlers. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + int ret; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + goto error; + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + if (unlikely(rseq_update_cpu_id(t))) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +#ifdef CONFIG_DEBUG_RSEQ + +/* + * Terminate the process if a syscall is issued within a restartable + * sequence. + */ +void rseq_syscall(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + + if (!t->rseq) + return; + if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) + force_sig(SIGSEGV, t); +} + +#endif + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, + int, flags, u32, sig) +{ + int ret; + + if (flags & RSEQ_FLAG_UNREGISTER) { + /* Unregister rseq for current thread. */ + if (current->rseq != rseq || !current->rseq) + return -EINVAL; + if (current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + ret = rseq_reset_rseq_cpu_id(current); + if (ret) + return ret; + current->rseq = NULL; + current->rseq_len = 0; + current->rseq_sig = 0; + return 0; + } + + if (unlikely(flags)) + return -EINVAL; + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq || current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; + } + + /* + * If there was no rseq previously registered, + * ensure the provided rseq is properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len != sizeof(*rseq)) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + return -EFAULT; + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* + * If rseq was previously inactive, and has just been + * registered, ensure the cpu_id_start and cpu_id fields + * are updated before returning to user-space. + */ + rseq_set_notify_resume(current); + + return 0; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9866f86f304..a98d54cd5535 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1191,6 +1191,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; + rseq_migrate(p); perf_event_task_migrate(p); } @@ -2634,6 +2635,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, { sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_task(next); prepare_arch_switch(next); diff --git a/kernel/signal.c b/kernel/signal.c index 0f865d67415d..8d8a940422a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1244,19 +1244,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; + rcu_read_lock(); for (;;) { - /* - * Disable interrupts early to avoid deadlocks. - * See rcu_read_unlock() comment header for details. - */ - local_irq_save(*flags); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) { - rcu_read_unlock(); - local_irq_restore(*flags); + if (unlikely(sighand == NULL)) break; - } + /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which @@ -1268,15 +1261,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ - spin_lock(&sighand->siglock); - if (likely(sighand == tsk->sighand)) { - rcu_read_unlock(); + spin_lock_irqsave(&sighand->siglock, *flags); + if (likely(sighand == tsk->sighand)) break; - } - spin_unlock(&sighand->siglock); - rcu_read_unlock(); - local_irq_restore(*flags); + spin_unlock_irqrestore(&sighand->siglock, *flags); } + rcu_read_unlock(); return sighand; } diff --git a/kernel/sys.c b/kernel/sys.c index d1b2b8d934bb..38509dc1f77b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. + */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 06b4ccee0047..df556175be50 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16); COND_SYSCALL(setresuid16); COND_SYSCALL(setreuid16); COND_SYSCALL(setuid16); + +/* restartable sequence */ +COND_SYSCALL(rseq); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9f9983b0a27d..465a28b4cd32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4362,6 +4362,7 @@ void set_worker_desc(const char *fmt, ...) va_end(args); } } +EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description |