From 02afc6267f6d55d47aba9fcafdbd1b7230d2294a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 May 2008 19:42:56 -0400 Subject: [PATCH] dup_fd() fixes, part 1 Move the sucker to fs/file.c in preparation to the rest Signed-off-by: Al Viro --- kernel/fork.c | 130 ---------------------------------------------------------- 1 file changed, 130 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 933e60ebccae..19908b26cf80 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -660,136 +660,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int count_open_files(struct fdtable *fdt) -{ - int size = fdt->max_fds; - int i; - - /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) - break; - } - i = (i+1) * 8 * sizeof(long); - return i; -} - -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - -/* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. - */ -static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) -{ - struct files_struct *newf; - struct file **old_fds, **new_fds; - int open_files, size, i; - struct fdtable *old_fdt, *new_fdt; - - *errorp = -ENOMEM; - newf = alloc_files(); - if (!newf) - goto out; - - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); - open_files = count_open_files(old_fdt); - - /* - * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. - */ - if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; - spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) - goto out_release; - new_fdt = files_fdtable(newf); - /* - * Reacquire the oldf lock and a pointer to its fd table - * who knows it may have a new bigger fd table. We need - * the latest pointer. - */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; -- cgit 1.4.1 From eceea0b3df05ed262ae32e0c6340cc7a3626632d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 May 2008 10:08:32 -0400 Subject: [PATCH] avoid multiplication overflows and signedness issues for max_fds Limit sysctl_nr_open - we don't want ->max_fds to exceed MAX_INT and we don't want size calculation for ->fd[] to overflow. Signed-off-by: Al Viro --- fs/file.c | 4 ++++ kernel/sysctl.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/file.c b/fs/file.c index 0f705c7cfefe..7b3887e054d0 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,8 @@ struct fdtable_defer { }; int sysctl_nr_open __read_mostly = 1024*1024; +int sysctl_nr_open_min = BITS_PER_LONG; +int sysctl_nr_open_max = 1024 * 1024; /* raised later */ /* * We use this list to defer free fdtables that have vmalloced @@ -405,6 +407,8 @@ void __init files_defer_init(void) int i; for_each_possible_cpu(i) fdtable_defer_list_init(i); + sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; } struct files_struct init_files = { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d7ffdc59816a..29116652dca8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -1190,7 +1191,9 @@ static struct ctl_table fs_table[] = { .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, }, { .ctl_name = FS_DENTRY, -- cgit 1.4.1 From fcaf1eb8685a00a99259e138e403841e984385b0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 May 2008 16:11:48 -0700 Subject: [patch 1/1] audit_send_reply(): fix error-path memory leak Addresses http://bugzilla.kernel.org/show_bug.cgi?id=10663 Reporter: Daniel Marjamki Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/audit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b7d3709cc452..e8692a5748c2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -572,16 +572,17 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) - return; + goto out; reply->pid = pid; reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); - if (IS_ERR(tsk)) { - kfree(reply); - kfree_skb(skb); - } + if (!IS_ERR(tsk)) + return; + kfree_skb(skb); +out: + kfree(reply); } /* -- cgit 1.4.1 From 6793a051fb9311f0f1ab7eafc5a9e69b8a1bd8d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 May 2008 17:10:12 -0700 Subject: [PATCH] list_for_each_rcu must die: audit All uses of list_for_each_rcu() can be profitably replaced by the easier-to-use list_for_each_entry_rcu(). This patch makes this change for the Audit system, in preparation for removing the list_for_each_rcu() API entirely. This time with well-formed SOB. Signed-off-by: Paul E. McKenney Signed-off-by: Al Viro --- kernel/audit_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 9ef5e0aacc3c..f7921a2ecf16 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,10 +172,9 @@ static void insert_hash(struct audit_chunk *chunk) struct audit_chunk *audit_tree_lookup(const struct inode *inode) { struct list_head *list = chunk_hash(inode); - struct list_head *pos; + struct audit_chunk *p; - list_for_each_rcu(pos, list) { - struct audit_chunk *p = container_of(pos, struct audit_chunk, hash); + list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { get_inotify_watch(&p->watch); return p; -- cgit 1.4.1 From c4ea6fcf5a192dbba54666f308bdace1c278e0c1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 May 2008 16:27:29 -0700 Subject: module loading ELF handling: use SELFMAG instead of numeric constant Signed-off-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f5e9491ef7ac..e6daf9a320a7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1780,7 +1780,7 @@ static struct module *load_module(void __user *umod, /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) || hdr->e_shentsize != sizeof(*sechdrs)) { -- cgit 1.4.1 From 34e4e2fef4c7a2f7699b3d25e48d871d3ac4c3e7 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Tue, 20 May 2008 13:59:48 +0400 Subject: modules: proper cleanup of kobject without CONFIG_SYSFS kobject: '' (ffffffffa0104050): is not initialized, yet kobject_put() is being called. ------------[ cut here ]------------ WARNING: at /home/den/src/linux-netns26/lib/kobject.c:583 kobject_put+0x53/0x55() Modules linked in: ipv6 nfsd lockd nfs_acl auth_rpcgss sunrpc exportfs ide_cd_mod cdrom button [last unloaded: pktgen] comm: rmmod Tainted: G W 2.6.26-rc3 #585 Call Trace: [] warn_on_slowpath+0x58/0x7a [] ? printk+0x67/0x69 [] ? printk+0x67/0x69 [] kobject_put+0x53/0x55 [] free_module+0x87/0xfa [] sys_delete_module+0x178/0x1e1 [] ? lockdep_sys_exit_thunk+0x35/0x67 [] ? trace_hardirqs_on_thunk+0x35/0x3a [] system_call_after_swapgs+0x7b/0x80 ---[ end trace 8f5aafa7f6406cf8 ]--- mod->mkobj.kobj is not initialized without CONFIG_SYSFS. Do not call kobject_put in this case. Signed-off-by: Denis V. Lunev Cc: Rusty Russell Cc: Kay Sievers Signed-off-by: Rusty Russell --- kernel/module.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e6daf9a320a7..5f80478b746d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1337,7 +1337,19 @@ out_unreg: kobject_put(&mod->mkobj.kobj); return err; } -#endif + +static void mod_sysfs_fini(struct module *mod) +{ + kobject_put(&mod->mkobj.kobj); +} + +#else /* CONFIG_SYSFS */ + +static void mod_sysfs_fini(struct module *mod) +{ +} + +#endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { @@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod) module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); - kobject_put(&mod->mkobj.kobj); + mod_sysfs_fini(mod); } /* -- cgit 1.4.1 From 3401a61e16a5b852d4e353c8850c857105a67a9c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 8 May 2008 15:20:38 +0200 Subject: stop_machine: make stop_machine_run more virtualization friendly On kvm I have seen some rare hangs in stop_machine when I used more guest cpus than hosts cpus. e.g. 32 guest cpus on 1 host cpu triggered the hang quite often. I could also reproduce the problem on a 4 way z/VM host with a 64 way guest. It turned out that the guest was consuming all available cpus mostly for spinning on scheduler locks like rq->lock. This is expected as the threads are calling yield all the time. The problem is now, that the host scheduling decisings together with the guest scheduling decisions and spinlocks not being fair managed to create an interesting scenario similar to a live lock. (Sometimes the hang resolved itself after some minutes) Changing stop_machine to yield the cpu to the hypervisor when yielding inside the guest fixed the problem for me. While I am not completely happy with this patch, I think it causes no harm and it really improves the situation for me. I used cpu_relax for yielding to the hypervisor, does that work on all architectures? p.s.: If you want to reproduce the problem, cpu hotplug and kprobes use stop_machine_run and both triggered the problem after some retries. Signed-off-by: Christian Borntraeger CC: Ingo Molnar Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0101aeef7ed7..b7350bbfb076 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -62,8 +62,7 @@ static int stopmachine(void *cpu) * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) yield(); - else - cpu_relax(); + cpu_relax(); } /* Ack: we are exiting. */ @@ -106,8 +105,10 @@ static int stop_machine(void) } /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) + while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { yield(); + cpu_relax(); + } /* If some failed, kill them all. */ if (ret < 0) { -- cgit 1.4.1 From da7978b0348d497688541e2d2f5739aa2a2c334f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 May 2008 13:04:41 -0700 Subject: signals: fix sigqueue_free() vs __exit_signal() race __exit_signal() does flush_sigqueue(tsk->pending) outside of ->siglock. This can race with another thread doing sigqueue_free(), we can free the same SIGQUEUE_PREALLOC sigqueue twice or corrupt the pending->list. Note that even sys_exit_group() can trigger this race, not only sys_timer_delete(). Move the callsite of flush_sigqueue(tsk->pending) under ->siglock. This patch doesn't touch flush_sigqueue(->shared_pending) below, it is called when there are no other threads which can play with signals, and sigqueue_free() can't be used outside of our thread group. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 ++++++- kernel/signal.c | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 1510f78a0ffa..8f6185e69b69 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -126,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk) __unhash_process(tsk); + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -133,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk) __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); diff --git a/kernel/signal.c b/kernel/signal.c index 72bb4f51f963..12ffea7c201d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1242,7 +1242,8 @@ void sigqueue_free(struct sigqueue *q) /* * If the signal is still pending remove it from the * pending queue. We must hold ->siglock while testing - * q->list to serialize with collect_signal(). + * q->list to serialize with collect_signal() or with + * __exit_signal()->flush_sigqueue(). */ spin_lock_irqsave(lock, flags); if (!list_empty(&q->list)) -- cgit 1.4.1 From 7b26655f6208fdefa9ab0adc016116324f8d4ba8 Mon Sep 17 00:00:00 2001 From: Shi Weihua Date: Fri, 23 May 2008 13:04:59 -0700 Subject: sys_prctl(): fix return of uninitialized value If none of the switch cases match, the PR_SET_PDEATHSIG and PR_SET_DUMPABLE cases of the switch statement will never write to local variable `error'. Signed-off-by: Shi Weihua Cc: Andrew G. Morgan Acked-by: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 895d2d4c9493..14e97282eb6c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long uninitialized_var(error); + long error = 0; if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; @@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: - if (arg2 == PR_TIMING_STATISTICAL) - error = 0; - else + if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; -- cgit 1.4.1 From 5c02b575780d0d785815a1e7b79a98edddee895a Mon Sep 17 00:00:00 2001 From: Cedric Le Goater Date: Fri, 23 May 2008 13:05:02 -0700 Subject: cgroups: remove node_ prefix_from ns subsystem This is a slight change in the namespace cgroup subsystem api. The change is that previously when cgroup_clone() was called (currently only from the unshare path in ns_proxy cgroup, you'd get a new group named "node_$pid" whereas now you'll get a group named after just your pid.) The only users who would notice it are those who are using the ns_proxy cgroup subsystem to auto-create cgroups when namespaces are unshared - something of an experimental feature, which I think really needs more complete container/namespace support in order to be useful. I suspect the only users are Cedric and Serge, or maybe a few others on containers@lists.linux-foundation.org. And in fact it would only be noticed by the users who make the assumption about how the name is generated, rather than getting it from the /proc//cgroups file for the process in question. Whether the change is actually needed or not I'm fairly agnostic on, but I guess it is more elegant to just use the pid as the new group name rather than adding a fairly arbitrary "node_" prefix on the front. [menage@google.com: provided changelog] Signed-off-by: Cedric Le Goater Cc: "Paul Menage" Cc: "Serge E. Hallyn" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fbc6fc8949b4..15ac0e1e4f4d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); + snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); -- cgit 1.4.1 From c8e85b4f4b9ee23bf0e79bdeb3da274a0f9c663f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 May 2008 20:55:42 +0400 Subject: posix timers: sigqueue_free: don't free sigqueue if it is queued Currently sigqueue_free() removes sigqueue from list, but doesn't cancel the pending signal. This is not consistent, the task should either receive the "full" signal along with siginfo_t, or it shouldn't receive the signal at all. Change sigqueue_free() to clear SIGQUEUE_PREALLOC but leave sigqueue on list if it is queued. This is a user-visible change. If the signal is blocked, it stays queued after sys_timer_delete() until unblocked with the "stale" si_code/si_value, and of course it is still counted wrt RLIMIT_SIGPENDING which also limits the number of posix timers. Signed-off-by: Oleg Nesterov Cc: Austin Clements Cc: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 12ffea7c201d..2955f6c4f36e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1240,18 +1240,22 @@ void sigqueue_free(struct sigqueue *q) BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* - * If the signal is still pending remove it from the - * pending queue. We must hold ->siglock while testing - * q->list to serialize with collect_signal() or with + * We must hold ->siglock while testing q->list + * to serialize with collect_signal() or with * __exit_signal()->flush_sigqueue(). */ spin_lock_irqsave(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; + /* + * If it is queued it will be freed when dequeued, + * like the "regular" sigqueue. + */ if (!list_empty(&q->list)) - list_del_init(&q->list); + q = NULL; spin_unlock_irqrestore(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - __sigqueue_free(q); + if (q) + __sigqueue_free(q); } int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) -- cgit 1.4.1 From cbaffba12ce08beb3e80bfda148ee0fa14aac188 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 26 May 2008 20:55:42 +0400 Subject: posix timers: discard SI_TIMER signals on exec Based on Roland's patch. This approach was suggested by Austin Clements from the very beginning, and then by Linus. As Austin pointed out, the execing task can be killed by SI_TIMER signal because exec flushes the signal handlers, but doesn't discard the pending signals generated by posix timers. Perhaps not a bug, but people find this surprising. See http://bugzilla.kernel.org/show_bug.cgi?id=10460 Signed-off-by: Oleg Nesterov Cc: Austin Clements Cc: Roland McGrath Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + include/linux/sched.h | 2 ++ kernel/signal.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3c2ba7ce11d4..9448f1b50b4a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -860,6 +860,7 @@ static int de_thread(struct task_struct *tsk) no_thread_group: exit_itimers(sig); + flush_itimer_signals(); if (leader) release_task(leader); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5395a6176f4b..3e05e5474749 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1848,7 +1848,9 @@ extern void exit_thread(void); extern void exit_files(struct task_struct *); extern void __cleanup_signal(struct signal_struct *); extern void __cleanup_sighand(struct sighand_struct *); + extern void exit_itimers(struct signal_struct *); +extern void flush_itimer_signals(void); extern NORET_TYPE void do_group_exit(int); diff --git a/kernel/signal.c b/kernel/signal.c index 2955f6c4f36e..6c0958e52ea7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +} + void ignore_signals(struct task_struct *t) { int i; -- cgit 1.4.1