From 3247343118daa73f2b94b7fa565425d1d9f9ac84 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 8 Nov 2013 18:52:21 +0100
Subject: uprobes: Add uprobe_task->dup_xol_work/dup_xol_addr

uprobe_task->vaddr is a bit strange. The generic code uses it only to
pass an additional argument to arch_uprobe_pre_xol(), and since it is
always equal to instruction_pointer() this looks even more strange.

And both utask->vaddr and utask->autask have the same scope: they only
have meaning while the task executes the probed insn out-of-line, so it
is safe to reuse both in the UTASK_RUNNING state.

This all means that logically ->vaddr belongs to arch_uprobe_task and
we should probably move it there; arch_uprobe_pre_xol() can record
instruction_pointer() itself. OTOH, it is also used by
uprobe_copy_process() and dup_xol_work() for another purpose; this
doesn't look clean and doesn't allow us to move this member into
arch_uprobe_task.

This patch adds a union of 2 anonymous structs into uprobe_task. The
first struct is autask + vaddr; this way we "almost" move vaddr into
autask. The second struct has 2 new members for the
uprobe_copy_process() paths: ->dup_xol_addr, which can be used instead
of ->vaddr, and ->dup_xol_work, which can be used to avoid kmalloc()
and simplify the code.

Note that this union will likely grow more members; we need something
like "private_data_for_handlers" so that the tracing handlers can use
it to communicate with call_fetch() methods.

Signed-off-by: Oleg Nesterov
Reviewed-by: Masami Hiramatsu
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6ca871b..df4ef0971266 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1403,12 +1403,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg)
 
 static void dup_xol_work(struct callback_head *work)
 {
-        kfree(work);
-
         if (current->flags & PF_EXITING)
                 return;
 
-        if (!__create_xol_area(current->utask->vaddr))
+        if (!__create_xol_area(current->utask->dup_xol_addr))
                 uprobe_warn(current, "dup xol area");
 }
 
@@ -1419,7 +1417,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
 {
         struct uprobe_task *utask = current->utask;
         struct mm_struct *mm = current->mm;
-        struct callback_head *work;
         struct xol_area *area;
 
         t->utask = NULL;
@@ -1441,14 +1438,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
         if (mm == t->mm)
                 return;
 
-        /* TODO: move it into the union in uprobe_task */
-        work = kmalloc(sizeof(*work), GFP_KERNEL);
-        if (!work)
-                return uprobe_warn(t, "dup xol area");
-
-        t->utask->vaddr = area->vaddr;
-        init_task_work(work, dup_xol_work);
-        task_work_add(t, work, true);
+        t->utask->dup_xol_addr = area->vaddr;
+        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
+        task_work_add(t, &t->utask->dup_xol_work, true);
 }
 
 /*
-- 
cgit 1.4.1


From 803200e24abf0f9ec18631290d26b2185477f3a6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 9 Nov 2013 17:58:54 +0100
Subject: uprobes: Don't assume that arch_uprobe->insn/ixol is u8[MAX_UINSN_BYTES]

arch_uprobe should be as opaque as possible to the generic code, but
currently it assumes that insn/ixol must be u8[] of a known size.
Remove this unnecessary dependency; we can use "&" and sizeof() with
the same effect.
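
To make the point concrete: taking "&" and sizeof() of a struct member
stays correct whatever the member's type, while naming a u8 array
directly relies on array-to-pointer decay and a separately known
length. A small standalone C sketch of the idea (hypothetical struct
and names, not kernel code):

    #include <stdio.h>

    /* Hypothetical arch-private struct: the generic code should not
     * have to care whether 'insn' is a byte array or something else.
     */
    struct arch_insn {
            unsigned char insn[16];
    };

    int main(void)
    {
            struct arch_insn a;

            /* &a.insn and sizeof(a.insn) remain valid for any type of
             * 'insn'; writing 'a.insn' with a hard-coded size only
             * works while it stays an array of that exact length.
             */
            printf("%p %zu\n", (void *)&a.insn, sizeof(a.insn));
            return 0;
    }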
Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index df4ef0971266..445962a72498 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -330,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
 int __weak
 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-        return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+        return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -529,8 +529,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)
 {
         struct address_space *mapping = uprobe->inode->i_mapping;
         loff_t offs = uprobe->offset;
-        void *insn = uprobe->arch.insn;
-        int size = MAX_UINSN_BYTES;
+        void *insn = &uprobe->arch.insn;
+        int size = sizeof(uprobe->arch.insn);
         int len, err = -EIO;
 
         /* Copy only available bytes, -EIO if nothing was read */
@@ -569,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                 goto out;
 
         ret = -ENOTSUPP;
-        if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
+        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
                 goto out;
 
         ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -1264,7 +1264,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 
         /* Initialize the slot */
         copy_to_page(area->page, xol_vaddr,
-                        uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
+                        &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
         /*
          * We probably need flush_icache_user_range() but it needs vma.
          * This should work on supported architectures too.
-- 
cgit 1.4.1


From c912dae60ae6f659455f239298110adc67a5f3e9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 9 Nov 2013 19:49:39 +0100
Subject: uprobes: Cleanup !CONFIG_UPROBES decls, unexport xol_area

1. Don't include asm/uprobes.h unconditionally; we only need it
   if CONFIG_UPROBES.

2. Move the definition of "struct xol_area" into uprobes.c.

   Perhaps we should simply kill struct uprobes_state, it buys
   nothing.

3. Kill the dummy definition of uprobe_get_swbp_addr(); nobody
   except handle_swbp() needs it.

4. Purely cosmetic, but move the decl of uprobe_get_swbp_addr()
   up, close to the other __weak helpers.

Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 include/linux/uprobes.h | 31 ++++---------------------------
 kernel/events/uprobes.c | 19 +++++++++++++++++++
 2 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2225542624de..e32251e00e62 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -33,10 +33,6 @@ struct mm_struct;
 struct inode;
 struct notifier_block;
 
-#ifdef CONFIG_ARCH_SUPPORTS_UPROBES
-# include <asm/uprobes.h>
-#endif
-
 #define UPROBE_HANDLER_REMOVE          1
 #define UPROBE_HANDLER_MASK            1
@@ -61,6 +57,8 @@ struct uprobe_consumer {
 };
 
 #ifdef CONFIG_UPROBES
+#include <asm/uprobes.h>
+
 enum uprobe_task_state {
         UTASK_RUNNING,
         UTASK_SSTEP,
@@ -93,24 +91,7 @@ struct uprobe_task {
         unsigned int                   depth;
 };
 
-/*
- * On a breakpoint hit, thread contests for a slot.  It frees the
- * slot after singlestep. Currently a fixed number of slots are
- * allocated.
- */
-struct xol_area {
-        wait_queue_head_t       wq;             /* if all slots are busy */
-        atomic_t                slot_count;     /* number of in-use slots */
-        unsigned long           *bitmap;        /* 0 = free slot */
-        struct page             *page;
-
-        /*
-         * We keep the vma's vm_start rather than a pointer to the vma
-         * itself.  The probed process or a naughty kernel module could make
-         * the vma go away, and we must handle that reasonably gracefully.
-         */
-        unsigned long           vaddr;          /* Page(s) of instruction slots */
-};
+struct xol_area;
 
 struct uprobes_state {
         struct xol_area         *xol_area;
@@ -120,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
 extern bool __weak is_trap_insn(uprobe_opcode_t *insn);
+extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
@@ -131,7 +113,6 @@ extern void uprobe_end_dup_mmap(void);
 extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
 extern void uprobe_free_utask(struct task_struct *t);
 extern void uprobe_copy_process(struct task_struct *t, unsigned long flags);
-extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs);
 extern int uprobe_post_sstep_notifier(struct pt_regs *regs);
 extern int uprobe_pre_sstep_notifier(struct pt_regs *regs);
 extern void uprobe_notify_resume(struct pt_regs *regs);
@@ -187,10 +168,6 @@ static inline bool uprobe_deny_signal(void)
 {
         return false;
 }
-static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
-{
-        return 0;
-}
 static inline void uprobe_free_utask(struct task_struct *t)
 {
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 445962a72498..51a7f535ff96 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -85,6 +85,25 @@ struct return_instance {
         struct return_instance  *next;          /* keep as stack */
 };
 
+/*
+ * On a breakpoint hit, thread contests for a slot.  It frees the
+ * slot after singlestep. Currently a fixed number of slots are
+ * allocated.
+ */
+struct xol_area {
+        wait_queue_head_t       wq;             /* if all slots are busy */
+        atomic_t                slot_count;     /* number of in-use slots */
+        unsigned long           *bitmap;        /* 0 = free slot */
+        struct page             *page;
+
+        /*
+         * We keep the vma's vm_start rather than a pointer to the vma
+         * itself. The probed process or a naughty kernel module could make
+         * the vma go away, and we must handle that reasonably gracefully.
+         */
+        unsigned long           vaddr;          /* Page(s) of instruction slots */
+};
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
-- 
cgit 1.4.1


From ad439356ae5ae7688b39f1107fd5b874850fec18 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 19 Nov 2013 17:20:21 +0100
Subject: uprobes: Document xol_area and arch_uprobe->insn/ixol

Document xol_area and arch_uprobe.
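
For reference, an arch that backs these opaque members looks roughly
like x86's arch_uprobe of this era; a sketch from memory, so treat the
field details as approximate rather than authoritative:

    /* arch/x86/include/asm/uprobes.h (sketch). The generic code only
     * ever takes &insn / &ixol and sizeof(), so the u8[] representation
     * stays private to the arch.
     */
    struct arch_uprobe {
            union {
                    u8 insn[MAX_UINSN_BYTES];
                    u8 ixol[MAX_UINSN_BYTES];
            };
            u16 fixups;
    #ifdef CONFIG_X86_64
            unsigned long rip_rela_target_address;
    #endif
    };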
Signed-off-by: Oleg Nesterov
Acked-by: Srikar Dronamraju
---
 kernel/events/uprobes.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 51a7f535ff96..b886a5e7d4ff 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -73,6 +73,17 @@ struct uprobe {
         struct inode            *inode;         /* Also hold a ref to inode */
         loff_t                  offset;
         unsigned long           flags;
+
+        /*
+         * The generic code assumes that it has two members of unknown type
+         * owned by the arch-specific code:
+         *
+         * insn - copy_insn() saves the original instruction here for
+         *      arch_uprobe_analyze_insn().
+         *
+         * ixol - potentially modified instruction to execute out of
+         *      line, copied to xol_area by xol_get_insn_slot().
+         */
         struct arch_uprobe      arch;
 };
 
@@ -86,6 +97,10 @@ struct return_instance {
 };
 
 /*
+ * Execute out of line area: anonymous executable mapping installed
+ * by the probed task to execute the copy of the original instruction
+ * mangled by set_swbp().
+ *
  * On a breakpoint hit, thread contests for a slot.  It frees the
  * slot after singlestep. Currently a fixed number of slots are
  * allocated.
-- 
cgit 1.4.1


From 71ad88efebbcde374bddf904b96f3a7fc82d45d4 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Tue, 12 Nov 2013 17:58:48 +0100
Subject: perf: Add active_entry list head to struct perf_event

This patch adds a new field to the struct perf_event. It is intended to
be used to chain events which are active (enabled). It helps in the
hardware layer for PMUs which do not have actual counter restrictions,
i.e., free running read-only counters. Active events are chained as
opposed to being tracked via the counter they use.

To save space we use a union with hlist_entry as both are mutually
exclusive (suggested by Jiri Olsa).

Signed-off-by: Stephane Eranian
Reviewed-by: Andi Kleen
Signed-off-by: Peter Zijlstra
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1384275531-10892-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 5 ++++-
 kernel/events/core.c       | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e069d1288df..8f4a70f2eca8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -319,7 +319,10 @@ struct perf_event {
          */
         struct list_head                migrate_entry;
 
-        struct hlist_node               hlist_entry;
+        union {
+                struct hlist_node       hlist_entry;
+                struct list_head        active_entry;
+        };
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 72348dc192c1..403b781daafb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6655,6 +6655,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         INIT_LIST_HEAD(&event->event_entry);
         INIT_LIST_HEAD(&event->sibling_list);
         INIT_LIST_HEAD(&event->rb_entry);
+        INIT_LIST_HEAD(&event->active_entry);
 
         init_waitqueue_head(&event->waitq);
         init_irq_work(&event->pending, perf_pending_event);
-- 
cgit 1.4.1


From c7f2e3cd6c1f4932ccc4135d050eae3f7c7aef63 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 25 Nov 2013 11:49:10 +0100
Subject: perf: Optimize ring-buffer write by depending on control dependencies

Remove a full barrier from the ring-buffer write path by relying on a
control dependency to order a LOAD -> STORE scenario.

Cc: "Paul E. McKenney"
McKenney" Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8alv40z6ikk57jzbaobnxrjl@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8b168af135b..146a5792b1d2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -61,19 +61,20 @@ again: * * kernel user * - * READ ->data_tail READ ->data_head - * smp_mb() (A) smp_rmb() (C) - * WRITE $data READ $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->data_head WRITE ->data_tail + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } * * Where A pairs with D, and B pairs with C. * - * I don't think A needs to be a full barrier because we won't in fact - * write data until we see the store from userspace. So we simply don't - * issue the data WRITE until we observe it. Be conservative for now. + * In our case (A) is a control dependency that separates the load of + * the ->data_tail and the stores of $data. In case ->data_tail + * indicates there is no room in the buffer to store $data we do not. * - * OTOH, D needs to be a full barrier since it separates the data READ + * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C @@ -81,7 +82,7 @@ again: * * See perf_output_begin(). */ - smp_wmb(); + smp_wmb(); /* B, matches C */ rb->user_page->data_head = head; /* @@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle, if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) goto fail; + + /* + * The above forms a control dependency barrier separating the + * @tail load above from the data stores below. Since the @tail + * load is required to compute the branch to fail below. + * + * A, matches D; the full memory barrier userspace SHOULD issue + * after reading the data and before storing the new tail + * position. + * + * See perf_output_put_handle(). + */ + head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); /* - * Separate the userpage->tail read from the data stores below. - * Matches the MB userspace SHOULD issue after reading the data - * and before storing the new tail position. - * - * See perf_output_put_handle(). + * We rely on the implied barrier() by local_cmpxchg() to ensure + * none of the data stores below can be lifted up by the compiler. */ - smp_mb(); if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); -- cgit 1.4.1 From bad7192b842c83e580747ca57104dd51fe08c223 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Nov 2013 13:54:38 +0000 Subject: perf: Fix PERF_EVENT_IOC_PERIOD to force-reset the period Vince Weaver reports that, on all architectures apart from ARM, PERF_EVENT_IOC_PERIOD doesn't actually update the period until the next event fires. This is counter-intuitive behaviour and is better dealt with in the core code. This patch ensures that the period is forcefully reset when dealing with such a request in the core code. A subsequent patch removes the equivalent hack from the ARM back-end. 
Reported-by: Vince Weaver
Signed-off-by: Peter Zijlstra
Signed-off-by: Will Deacon
Link: http://lkml.kernel.org/r/1385560479-11014-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 403b781daafb..89d34f9bb8cb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3527,7 +3527,7 @@ static void perf_event_for_each(struct perf_event *event,
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
         struct perf_event_context *ctx = event->ctx;
-        int ret = 0;
+        int ret = 0, active;
         u64 value;
 
         if (!is_sampling_event(event))
@@ -3551,6 +3551,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
                 event->attr.sample_period = value;
                 event->hw.sample_period = value;
         }
+
+        active = (event->state == PERF_EVENT_STATE_ACTIVE);
+        if (active) {
+                perf_pmu_disable(ctx->pmu);
+                event->pmu->stop(event, PERF_EF_UPDATE);
+        }
+
+        local64_set(&event->hw.period_left, 0);
+
+        if (active) {
+                event->pmu->start(event, PERF_EF_RELOAD);
+                perf_pmu_enable(ctx->pmu);
+        }
+
 unlock:
         raw_spin_unlock_irq(&ctx->lock);
-- 
cgit 1.4.1


From f3ae75de98c4bac145a87d830c156c96f9414022 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Wed, 8 Jan 2014 11:15:52 +0100
Subject: perf/x86: Fix active_entry initialization

This patch fixes a problem with the initialization of the struct
perf_event active_entry field. It is defined inside an anonymous union
and was initialized in perf_event_alloc() using INIT_LIST_HEAD().
However, at that time we do not know whether the event is going to use
active_entry or hlist_entry (SW) -- or at least, we don't want to make
that determination there.

The problem is that hlist and list_head are not initialized the same
way. One is okay with NULL (from kzalloc()), the other needs its
pointers to point to self.

This patch resolves the problem by dropping the union. This will avoid
problems later on, if someone starts using active_entry or hlist_entry
without verifying that they actually overlap. It also solves the
initialization problem.
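
The difference is visible in the initializers themselves; a short
sketch (not from the patch) of the two empty states:

    #include <linux/list.h>

    /* An empty list_head is circular -- it must point at itself --
     * while an empty hlist_node is all-NULL, which is exactly what
     * kzalloc() already produces.
     */
    static void init_example(void)
    {
            struct list_head lh;
            struct hlist_node hn;

            INIT_LIST_HEAD(&lh);    /* lh.next = lh.prev = &lh */
            INIT_HLIST_NODE(&hn);   /* hn.next = NULL; hn.pprev = NULL */
    }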
Signed-off-by: Stephane Eranian
Cc: ak@linux.intel.com
Cc: acme@redhat.com
Cc: jolsa@redhat.com
Cc: zheng.z.yan@intel.com
Cc: bp@alien8.de
Cc: vincent.weaver@maine.edu
Cc: maria.n.dimakopoulou@gmail.com
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1389176153-3128-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/perf_event.h | 6 ++----
 kernel/events/core.c       | 2 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8f4a70f2eca8..e56b07f5c9b6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -319,10 +319,8 @@ struct perf_event {
          */
         struct list_head                migrate_entry;
 
-        union {
-                struct hlist_node       hlist_entry;
-                struct list_head        active_entry;
-        };
+        struct hlist_node               hlist_entry;
+        struct list_head                active_entry;
         int                             nr_siblings;
         int                             group_flags;
         struct perf_event               *group_leader;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 89d34f9bb8cb..c3b6c2799f34 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6670,6 +6670,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         INIT_LIST_HEAD(&event->sibling_list);
         INIT_LIST_HEAD(&event->rb_entry);
         INIT_LIST_HEAD(&event->active_entry);
+        INIT_HLIST_NODE(&event->hlist_entry);
+
         init_waitqueue_head(&event->waitq);
         init_irq_work(&event->pending, perf_pending_event);
-- 
cgit 1.4.1


From a21b0b354d4ac39be691f51c53562e2c24443d9e Mon Sep 17 00:00:00 2001
From: Yann Droneaud
Date: Sun, 5 Jan 2014 21:36:33 +0100
Subject: perf: Introduce a flag to enable close-on-exec in perf_event_open()

Unlike recent modern userspace APIs such as epoll_create1(EPOLL_CLOEXEC),
eventfd(EFD_CLOEXEC), fanotify_init(FAN_CLOEXEC), inotify_init1(IN_CLOEXEC),
signalfd(SFD_CLOEXEC), timerfd_create(TFD_CLOEXEC), or the venerable
general-purpose open(O_CLOEXEC), the perf_event_open() syscall lacks a
flag to atomically set the FD_CLOEXEC (i.e. close-on-exec) flag on the
file descriptor it returns to userspace.

The present patch adds a PERF_FLAG_FD_CLOEXEC flag to allow the
perf_event_open() syscall to atomically set close-on-exec.

Having this flag will enable userspace to remove the file descriptor
from the list of file descriptors being inherited across exec, without
the need to call fcntl(fd, F_SETFD, FD_CLOEXEC) and the associated race
condition between the current thread and another thread calling fork(2)
then execve(2).

Links:
- Secure File Descriptor Handling (Ulrich Drepper, 2008)
  http://udrepper.livejournal.com/20407.html
- Excuse me son, but your code is leaking !!!
  (Dan Walsh, March 2012)
  http://danwalsh.livejournal.com/53603.html
- Notes in DMA buffer sharing: leak and security hole
  http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/dma-buf-sharing.txt?id=v3.13-rc3#n428

Signed-off-by: Yann Droneaud
Cc: Arnaldo Carvalho de Melo
Cc: Al Viro
Cc: Andrew Morton
Cc: Paul Mackerras
Cc: Linus Torvalds
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/8c03f54e1598b1727c19706f3af03f98685d9fe6.1388952061.git.ydroneaud@opteya.com
Signed-off-by: Ingo Molnar
---
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/core.c            | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e1802d6153ae..ca018b4085c6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -724,6 +724,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_FD_NO_GROUP          (1U << 0)
 #define PERF_FLAG_FD_OUTPUT            (1U << 1)
 #define PERF_FLAG_PID_CGROUP           (1U << 2) /* pid=cgroup id, per-cpu mode only */
+#define PERF_FLAG_FD_CLOEXEC           (1U << 3) /* O_CLOEXEC */
 
 union perf_mem_data_src {
         __u64 val;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c3b6c2799f34..5c8726473006 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                        PERF_FLAG_FD_OUTPUT  |\
-                       PERF_FLAG_PID_CGROUP)
+                       PERF_FLAG_PID_CGROUP |\
+                       PERF_FLAG_FD_CLOEXEC)
 
 /*
  * branch priv levels that need permission checks
@@ -6982,6 +6983,7 @@ SYSCALL_DEFINE5(perf_event_open,
         int event_fd;
         int move_group = 0;
         int err;
+        int f_flags = O_RDWR;
 
         /* for future expandability... */
         if (flags & ~PERF_FLAG_ALL)
@@ -7010,7 +7012,10 @@ SYSCALL_DEFINE5(perf_event_open,
         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                 return -EINVAL;
 
-        event_fd = get_unused_fd();
+        if (flags & PERF_FLAG_FD_CLOEXEC)
+                f_flags |= O_CLOEXEC;
+
+        event_fd = get_unused_fd_flags(f_flags);
         if (event_fd < 0)
                 return event_fd;
 
@@ -7132,7 +7137,8 @@ SYSCALL_DEFINE5(perf_event_open,
                 goto err_context;
         }
 
-        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
+        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
+                                        f_flags);
         if (IS_ERR(event_file)) {
                 err = PTR_ERR(event_file);
                 goto err_context;
-- 
cgit 1.4.1
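
A minimal usage sketch of the new flag (not part of the patch; assumes
kernel headers that define PERF_FLAG_FD_CLOEXEC, i.e. v3.14 or newer):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Open a task-clock software event whose fd is close-on-exec from
     * the start, with no fcntl() race against a concurrent fork+exec.
     */
    static int open_cloexec_counter(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_TASK_CLOCK;

            /* pid = 0 (calling task), cpu = -1 (any), no group leader */
            return syscall(__NR_perf_event_open, &attr, 0, -1, -1,
                           PERF_FLAG_FD_CLOEXEC);
    }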