From 3247343118daa73f2b94b7fa565425d1d9f9ac84 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 8 Nov 2013 18:52:21 +0100
Subject: uprobes: Add uprobe_task->dup_xol_work/dup_xol_addr

uprobe_task->vaddr is a bit strange. The generic code uses it only to
pass an additional argument to arch_uprobe_pre_xol(), and since it is
always equal to instruction_pointer() this looks even more strange.

And both utask->vaddr and utask->autask have the same scope: they only
have meaning while the task executes the probed insn out-of-line, so it
is safe to reuse both in the UTASK_RUNNING state.

This all means that logically ->vaddr belongs to arch_uprobe_task and
we should probably move it there; arch_uprobe_pre_xol() can record
instruction_pointer() itself. OTOH, it is also used by
uprobe_copy_process() and dup_xol_work() for another purpose; this
doesn't look clean and doesn't allow us to move this member into
arch_uprobe_task.

This patch adds a union of 2 anonymous structs into uprobe_task. The
first struct is autask + vaddr; this way we "almost" move vaddr into
autask. The second struct has 2 new members for the
uprobe_copy_process() paths: ->dup_xol_addr, which can be used instead
of ->vaddr, and ->dup_xol_work, which can be used to avoid kmalloc()
and simplify the code.

Note that this union will likely grow more members; we need something
like "private_data_for_handlers" so that the tracing handlers can use
it to communicate with call_fetch() methods.

Signed-off-by: Oleg Nesterov
Reviewed-by: Masami Hiramatsu
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6ca871b..df4ef0971266 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1403,12 +1403,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg)
 
 static void dup_xol_work(struct callback_head *work)
 {
-        kfree(work);
-
         if (current->flags & PF_EXITING)
                 return;
 
-        if (!__create_xol_area(current->utask->vaddr))
+        if (!__create_xol_area(current->utask->dup_xol_addr))
                 uprobe_warn(current, "dup xol area");
 }
 
@@ -1419,7 +1417,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
 {
         struct uprobe_task *utask = current->utask;
         struct mm_struct *mm = current->mm;
-        struct callback_head *work;
         struct xol_area *area;
 
         t->utask = NULL;
@@ -1441,14 +1438,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
         if (mm == t->mm)
                 return;
 
-        /* TODO: move it into the union in uprobe_task */
-        work = kmalloc(sizeof(*work), GFP_KERNEL);
-        if (!work)
-                return uprobe_warn(t, "dup xol area");
-
-        t->utask->vaddr = area->vaddr;
-        init_task_work(work, dup_xol_work);
-        task_work_add(t, work, true);
+        t->utask->dup_xol_addr = area->vaddr;
+        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+        task_work_add(t, &t->utask->dup_xol_work, true);
 }
 
 /*
-- 
cgit 1.4.1


From 803200e24abf0f9ec18631290d26b2185477f3a6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 9 Nov 2013 17:58:54 +0100
Subject: uprobes: Don't assume that arch_uprobe->insn/ixol is u8[MAX_UINSN_BYTES]

arch_uprobe should be as opaque as possible to the generic code, but
currently it assumes that insn/ixol must be u8[] of a known size.
Remove this unnecessary dependency; we can use "&" and sizeof() with
the same effect.
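
To make the point concrete: taking "&" and sizeof() of a struct member
stays correct whatever the member's type, while naming a u8 array
directly relies on array-to-pointer decay and a separately known
length. A small standalone C sketch of the idea (hypothetical struct
and names, not kernel code):

    #include <stdio.h>

    /* Hypothetical arch-private struct: the generic code should not
     * have to care whether 'insn' is a byte array or something else.
     */
    struct arch_insn {
            unsigned char insn[16];
    };

    int main(void)
    {
            struct arch_insn a;

            /* &a.insn and sizeof(a.insn) remain valid for any type of
             * 'insn'; writing 'a.insn' with a hard-coded size only
             * works while it stays an array of that exact length.
             */
            printf("%p %zu\n", (void *)&a.insn, sizeof(a.insn));
            return 0;
    }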
Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index df4ef0971266..445962a72498 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -330,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
 int __weak
 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-        return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+        return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -529,8 +529,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
 {
         struct address_space *mapping = uprobe->inode->i_mapping;
         loff_t offs = uprobe->offset;
-        void *insn = uprobe->arch.insn;
-        int size = MAX_UINSN_BYTES;
+        void *insn = &uprobe->arch.insn;
+        int size = sizeof(uprobe->arch.insn);
         int len, err = -EIO;
 
         /* Copy only available bytes, -EIO if nothing was read */
@@ -569,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                 goto out;
 
         ret = -ENOTSUPP;
-        if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
+        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                 goto out;
 
         ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -1264,7 +1264,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 
         /* Initialize the slot */
         copy_to_page(area->page, xol_vaddr,
-                        uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+                        &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
         /*
          * We probably need flush_icache_user_range() but it needs vma.
          * This should work on supported architectures too.
-- 
cgit 1.4.1


From c912dae60ae6f659455f239298110adc67a5f3e9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 9 Nov 2013 19:49:39 +0100
Subject: uprobes: Cleanup !CONFIG_UPROBES decls, unexport xol_area

1. Don't include asm/uprobes.h unconditionally; we only need it
   if CONFIG_UPROBES.

2. Move the definition of "struct xol_area" into uprobes.c.

   Perhaps we should simply kill struct uprobes_state, it buys
   nothing.

3. Kill the dummy definition of uprobe_get_swbp_addr(); nobody
   except handle_swbp() needs it.

4. Purely cosmetic, but move the decl of uprobe_get_swbp_addr()
   up, close to the other __weak helpers.

Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 include/linux/uprobes.h | 31 ++++---------------------------
 kernel/events/uprobes.c | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2225542624de..e32251e00e62 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -33,10 +33,6 @@ struct mm_struct;
 struct inode;
 struct notifier_block;
 
-#ifdef CONFIG_ARCH_SUPPORTS_UPROBES
-# include <asm/uprobes.h>
-#endif
-
 #define UPROBE_HANDLER_REMOVE          1
 #define UPROBE_HANDLER_MASK            1
@@ -61,6 +57,8 @@ struct uprobe_consumer {
 };
 
 #ifdef CONFIG_UPROBES
+#include <asm/uprobes.h>
+
 enum uprobe_task_state {
         UTASK_RUNNING,
         UTASK_SSTEP,
@@ -93,24 +91,7 @@ struct uprobe_task {
         unsigned int                   depth;
 };
 
-/*
- * On a breakpoint hit, thread contests for a slot.  It frees the
- * slot after singlestep. Currently a fixed number of slots are
- * allocated.
- */
-struct xol_area {
-        wait_queue_head_t       wq;             /* if all slots are busy */
-        atomic_t                slot_count;     /* number of in-use slots */
-        unsigned long           *bitmap;        /* 0 = free slot */
-        struct page             *page;
-
-        /*
-         * We keep the vma's vm_start rather than a pointer to the vma
-         * itself.  The probed process or a naughty kernel module could make
-         * the vma go away, and we must handle that reasonably gracefully.
-         */
-        unsigned long           vaddr;          /* Page(s) of instruction slots */
-};
+struct xol_area;
 
 struct uprobes_state {
         struct xol_area         *xol_area;
@@ -120,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern bool __weak is_trap_insn(uprobe_opcode_t *insn);
+extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
@@ -131,7 +113,6 @@ extern void uprobe_end_dup_mmap(void);
 extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
 extern void uprobe_free_utask(struct task_struct *t);
 extern void uprobe_copy_process(struct task_struct *t, unsigned long flags);
-extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
 extern int uprobe_post_sstep_notifier(struct pt_regs *regs);
 extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
 extern void uprobe_notify_resume(struct pt_regs *regs);
@@ -187,10 +168,6 @@ static inline bool uprobe_deny_signal(void)
 {
         return false;
 }
-static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
-{
-        return 0;
-}
 static inline void uprobe_free_utask(struct task_struct *t)
 {
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 445962a72498..51a7f535ff96 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -85,6 +85,25 @@ struct return_instance {
         struct return_instance  *next;          /* keep as stack */
 };
 
+/*
+ * On a breakpoint hit, thread contests for a slot.  It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+        wait_queue_head_t       wq;             /* if all slots are busy */
+        atomic_t                slot_count;     /* number of in-use slots */
+        unsigned long           *bitmap;        /* 0 = free slot */
+        struct page             *page;
+
+        /*
+         * We keep the vma's vm_start rather than a pointer to the vma
+         * itself. The probed process or a naughty kernel module could make
+         * the vma go away, and we must handle that reasonably gracefully.
+         */
+        unsigned long           vaddr;          /* Page(s) of instruction slots */
+};
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
-- 
cgit 1.4.1


From ad439356ae5ae7688b39f1107fd5b874850fec18 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 19 Nov 2013 17:20:21 +0100
Subject: uprobes: Document xol_area and arch_uprobe->insn/ixol

Document xol_area and arch_uprobe.
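
For reference, an arch that backs these opaque members looks roughly
like x86's arch_uprobe of this era; a sketch from memory, so treat the
field details as approximate rather than authoritative:

    /* arch/x86/include/asm/uprobes.h (sketch). The generic code only
     * ever takes &insn / &ixol and sizeof(), so the u8[] representation
     * stays private to the arch.
     */
    struct arch_uprobe {
            union {
                    u8 insn[MAX_UINSN_BYTES];
                    u8 ixol[MAX_UINSN_BYTES];
            };
            u16 fixups;
    #ifdef CONFIG_X86_64
            unsigned long rip_rela_target_address;
    #endif
    };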
Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 51a7f535ff96..b886a5e7d4ff 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,17 @@ struct uprobe {
         struct inode            *inode;         /* Also hold a ref to inode */
         loff_t                  offset;
         unsigned long           flags;
+
+        /*
+         * The generic code assumes that it has two members of unknown type
+         * owned by the arch-specific code:
+         *
+         * insn - copy_insn() saves the original instruction here for
+         *      arch_uprobe_analyze_insn().
+         *
+         * ixol - potentially modified instruction to execute out of
+         *      line, copied to xol_area by xol_get_insn_slot().
+         */
         struct arch_uprobe      arch;
 };
 
@@ -86,6 +97,10 @@ struct return_instance {
 };
 
 /*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
  * On a breakpoint hit, thread contests for a slot.  It frees the
  * slot after singlestep. Currently a fixed number of slots are
  * allocated.
-- 
cgit 1.4.1


From 71ad88efebbcde374bddf904b96f3a7fc82d45d4 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Tue, 12 Nov 2013 17:58:48 +0100
Subject: perf: Add active_entry list head to struct perf_event

This patch adds a new field to the struct perf_event. It is intended to
be used to chain events which are active (enabled). It helps in the
hardware layer for PMUs which do not have actual counter restrictions,
i.e., free running read-only counters. Active events are chained as
opposed to being tracked via the counter they use.

To save space we use a union with hlist_entry as both are mutually
exclusive (suggested by Jiri Olsa).

Signed-off-by: Stephane Eranian
Reviewed-by: Andi Kleen
Signed-off-by: Peter Zijlstra
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1384275531-10892-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 5 ++++-
 kernel/events/core.c       | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e069d1288df..8f4a70f2eca8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -319,7 +319,10 @@ struct perf_event {
          */
         struct list_head                migrate_entry;
 
-        struct hlist_node               hlist_entry;
+        union {
+                struct hlist_node       hlist_entry;
+                struct list_head        active_entry;
+        };
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72348dc192c1..403b781daafb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6655,6 +6655,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         INIT_LIST_HEAD(&event->event_entry);
         INIT_LIST_HEAD(&event->sibling_list);
         INIT_LIST_HEAD(&event->rb_entry);
+        INIT_LIST_HEAD(&event->active_entry);
 
         init_waitqueue_head(&event->waitq);
         init_irq_work(&event->pending, perf_pending_event);
-- 
cgit 1.4.1


From c7f2e3cd6c1f4932ccc4135d050eae3f7c7aef63 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 25 Nov 2013 11:49:10 +0100
Subject: perf: Optimize ring-buffer write by depending on control dependencies

Remove a full barrier from the ring-buffer write path by relying on a
control dependency to order a LOAD -> STORE scenario.

Cc: "Paul E. McKenney"
McKenney" Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8alv40z6ikk57jzbaobnxrjl@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8b168af135b..146a5792b1d2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -61,19 +61,20 @@ again: * * kernel user * - * READ ->data_tail READ ->data_head - * smp_mb() (A) smp_rmb() (C) - * WRITE $data READ $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->data_head WRITE ->data_tail + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } * * Where A pairs with D, and B pairs with C. * - * I don't think A needs to be a full barrier because we won't in fact - * write data until we see the store from userspace. So we simply don't - * issue the data WRITE until we observe it. Be conservative for now. + * In our case (A) is a control dependency that separates the load of + * the ->data_tail and the stores of $data. In case ->data_tail + * indicates there is no room in the buffer to store $data we do not. * - * OTOH, D needs to be a full barrier since it separates the data READ + * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C @@ -81,7 +82,7 @@ again: * * See perf_output_begin(). */ - smp_wmb(); + smp_wmb(); /* B, matches C */ rb->user_page->data_head = head; /* @@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle, if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) goto fail; + + /* + * The above forms a control dependency barrier separating the + * @tail load above from the data stores below. Since the @tail + * load is required to compute the branch to fail below. + * + * A, matches D; the full memory barrier userspace SHOULD issue + * after reading the data and before storing the new tail + * position. + * + * See perf_output_put_handle(). + */ + head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); /* - * Separate the userpage->tail read from the data stores below. - * Matches the MB userspace SHOULD issue after reading the data - * and before storing the new tail position. - * - * See perf_output_put_handle(). + * We rely on the implied barrier() by local_cmpxchg() to ensure + * none of the data stores below can be lifted up by the compiler. */ - smp_mb(); if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); -- cgit 1.4.1 From bad7192b842c83e580747ca57104dd51fe08c223 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Nov 2013 13:54:38 +0000 Subject: perf: Fix PERF_EVENT_IOC_PERIOD to force-reset the period Vince Weaver reports that, on all architectures apart from ARM, PERF_EVENT_IOC_PERIOD doesn't actually update the period until the next event fires. This is counter-intuitive behaviour and is better dealt with in the core code. This patch ensures that the period is forcefully reset when dealing with such a request in the core code. A subsequent patch removes the equivalent hack from the ARM back-end. 
Reported-by: Vince Weaver
Signed-off-by: Peter Zijlstra
Signed-off-by: Will Deacon
Link: http://lkml.kernel.org/r/1385560479-11014-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 403b781daafb..89d34f9bb8cb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3527,7 +3527,7 @@ static void perf_event_for_each(struct perf_event *event,
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
         struct perf_event_context *ctx = event->ctx;
-        int ret = 0;
+        int ret = 0, active;
         u64 value;
 
         if (!is_sampling_event(event))
@@ -3551,6 +3551,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
                 event->attr.sample_period = value;
                 event->hw.sample_period = value;
         }
+
+        active = (event->state == PERF_EVENT_STATE_ACTIVE);
+        if (active) {
+                perf_pmu_disable(ctx->pmu);
+                event->pmu->stop(event, PERF_EF_UPDATE);
+        }
+
+        local64_set(&event->hw.period_left, 0);
+
+        if (active) {
+                event->pmu->start(event, PERF_EF_RELOAD);
+                perf_pmu_enable(ctx->pmu);
+        }
+
 unlock:
         raw_spin_unlock_irq(&ctx->lock);
-- 
cgit 1.4.1


From f3ae75de98c4bac145a87d830c156c96f9414022 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 8 Jan 2014 11:15:52 +0100
Subject: perf/x86: Fix active_entry initialization

This patch fixes a problem with the initialization of the struct
perf_event active_entry field. It is defined inside an anonymous union
and was initialized in perf_event_alloc() using INIT_LIST_HEAD().
However, at that time we do not know whether the event is going to use
active_entry or hlist_entry (SW) -- or at least, we don't want to make
that determination there.

The problem is that hlist and list_head are not initialized the same
way. One is okay with NULL (from kzalloc()), the other needs its
pointers to point to self.

This patch resolves the problem by dropping the union. This will avoid
problems later on, if someone starts using active_entry or hlist_entry
without verifying that they actually overlap. It also solves the
initialization problem.
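
The difference is visible in the initializers themselves; a short
sketch (not from the patch) of the two empty states:

    #include <linux/list.h>

    /* An empty list_head is circular -- it must point at itself --
     * while an empty hlist_node is all-NULL, which is exactly what
     * kzalloc() already produces.
     */
    static void init_example(void)
    {
            struct list_head lh;
            struct hlist_node hn;

            INIT_LIST_HEAD(&lh);    /* lh.next = lh.prev = &lh */
            INIT_HLIST_NODE(&hn);   /* hn.next = NULL; hn.pprev = NULL */
    }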
Signed-off-by: Stephane Eranian
Cc: ak@linux.intel.com
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: vincent.weaver@maine.edu
Cc: maria.n.dimakopoulou@gmail.com
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1389176153-3128-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 6 ++----
 kernel/events/core.c       | 2 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8f4a70f2eca8..e56b07f5c9b6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -319,10 +319,8 @@ struct perf_event {
          */
         struct list_head                migrate_entry;
 
-        union {
-                struct hlist_node       hlist_entry;
-                struct list_head        active_entry;
-        };
+        struct hlist_node               hlist_entry;
+        struct list_head                active_entry;
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 89d34f9bb8cb..c3b6c2799f34 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6670,6 +6670,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         INIT_LIST_HEAD(&event->sibling_list);
         INIT_LIST_HEAD(&event->rb_entry);
         INIT_LIST_HEAD(&event->active_entry);
+        INIT_HLIST_NODE(&event->hlist_entry);
+
         init_waitqueue_head(&event->waitq);
         init_irq_work(&event->pending, perf_pending_event);
-- 
cgit 1.4.1


From a21b0b354d4ac39be691f51c53562e2c24443d9e Mon Sep 17 00:00:00 2001
From: Yann Droneaud
Date: Sun, 5 Jan 2014 21:36:33 +0100
Subject: perf: Introduce a flag to enable close-on-exec in perf_event_open()

Unlike recent modern userspace APIs such as epoll_create1(EPOLL_CLOEXEC),
eventfd(EFD_CLOEXEC), fanotify_init(FAN_CLOEXEC), inotify_init1(IN_CLOEXEC),
signalfd(SFD_CLOEXEC), timerfd_create(TFD_CLOEXEC), or the venerable
general-purpose open(O_CLOEXEC), the perf_event_open() syscall lacks a
flag to atomically set the FD_CLOEXEC (i.e. close-on-exec) flag on the
file descriptor it returns to userspace.

The present patch adds a PERF_FLAG_FD_CLOEXEC flag to allow the
perf_event_open() syscall to atomically set close-on-exec.

Having this flag will enable userspace to remove the file descriptor
from the list of file descriptors being inherited across exec, without
the need to call fcntl(fd, F_SETFD, FD_CLOEXEC) and the associated race
condition between the current thread and another thread calling fork(2)
then execve(2).

Links:
- Secure File Descriptor Handling (Ulrich Drepper, 2008)
  http://udrepper.livejournal.com/20407.html
- Excuse me son, but your code is leaking !!!
  (Dan Walsh, March 2012)
  http://danwalsh.livejournal.com/53603.html
- Notes in DMA buffer sharing: leak and security hole
  http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/dma-buf-sharing.txt?id=v3.13-rc3#n428

Signed-off-by: Yann Droneaud
Cc: Arnaldo Carvalho de Melo
Cc: Al Viro
Cc: Andrew Morton
Cc: Paul Mackerras
Cc: Linus Torvalds
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/8c03f54e1598b1727c19706f3af03f98685d9fe6.1388952061.git.ydroneaud@opteya.com
Signed-off-by: Ingo Molnar
---
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/core.c            | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e1802d6153ae..ca018b4085c6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -724,6 +724,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_FD_NO_GROUP          (1U << 0)
 #define PERF_FLAG_FD_OUTPUT            (1U << 1)
 #define PERF_FLAG_PID_CGROUP           (1U << 2) /* pid=cgroup id, per-cpu mode only */
+#define PERF_FLAG_FD_CLOEXEC           (1U << 3) /* O_CLOEXEC */
 
 union perf_mem_data_src {
         __u64 val;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c3b6c2799f34..5c8726473006 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                        PERF_FLAG_FD_OUTPUT  |\
-                       PERF_FLAG_PID_CGROUP)
+                       PERF_FLAG_PID_CGROUP |\
+                       PERF_FLAG_FD_CLOEXEC)
 
 /*
  * branch priv levels that need permission checks
@@ -6982,6 +6983,7 @@ SYSCALL_DEFINE5(perf_event_open,
         int event_fd;
         int move_group = 0;
         int err;
+        int f_flags = O_RDWR;
 
         /* for future expandability... */
         if (flags & ~PERF_FLAG_ALL)
@@ -7010,7 +7012,10 @@ SYSCALL_DEFINE5(perf_event_open,
         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                 return -EINVAL;
 
-        event_fd = get_unused_fd();
+        if (flags & PERF_FLAG_FD_CLOEXEC)
+                f_flags |= O_CLOEXEC;
+
+        event_fd = get_unused_fd_flags(f_flags);
         if (event_fd < 0)
                 return event_fd;
 
@@ -7132,7 +7137,8 @@ SYSCALL_DEFINE5(perf_event_open,
                 goto err_context;
         }
 
-        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+                                        f_flags);
         if (IS_ERR(event_file)) {
                 err = PTR_ERR(event_file);
                 goto err_context;
-- 
cgit 1.4.1
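
A minimal usage sketch of the new flag (not part of the patch; assumes
kernel headers that define PERF_FLAG_FD_CLOEXEC, i.e. v3.14 or newer):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a task-clock software event whose fd is close-on-exec from
     * the start, with no fcntl() race against a concurrent fork+exec.
     */
    static int open_cloexec_counter(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_TASK_CLOCK;

            /* pid = 0 (calling task), cpu = -1 (any), no group leader */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1,
                           PERF_FLAG_FD_CLOEXEC);
    }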