summary refs log tree commit diff
diff options
context:
space:
mode:
author Ingo Molnar <mingo@kernel.org> 2015-05-06 04:42:12 +0200
committer Ingo Molnar <mingo@kernel.org> 2015-05-06 04:42:12 +0200
commit 1836ac856e4fb446e48afa4f8cae897d4856b06c (patch)
tree 1b5da329b15a09e13189801c82266fb6805a90af
parent 5ebe6afaf0057ac3eaeb98defd5456894b446d22 (diff)
parent 3698dab1c849c7e1cd440df4fca24baa1973d53b (diff)
download linux-1836ac856e4fb446e48afa4f8cae897d4856b06c.tar.gz
Merge tag 'perf-core-for-mingo-3' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

User visible changes:

  - Improve --filter support for 'perf probe', allowing its arguments to be
    used with other commands, such as --add, --del, etc (Masami Hiramatsu)

  - Show warning when running 'perf kmem stat' on an unsuitable perf.data file,
    i.e. one with events that are not the ones required for the stat variant
    used (Namhyung Kim).

Infrastructure changes:

  - Auxtrace support patches, paving the way to support Intel PT and BTS (Adrian Hunter)

  - hists browser (top, report) refactorings (Namhyung Kim)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--Makefile5
-rw-r--r--tools/Makefile20
-rw-r--r--tools/lib/traceevent/event-parse.c6
-rw-r--r--tools/lib/traceevent/event-parse.h1
-rw-r--r--tools/perf/Documentation/callchain-overhead-calculation.txt108
-rw-r--r--tools/perf/Documentation/perf-inject.txt27
-rw-r--r--tools/perf/Documentation/perf-kmem.txt11
-rw-r--r--tools/perf/Documentation/perf-probe.txt11
-rw-r--r--tools/perf/Documentation/perf-record.txt9
-rw-r--r--tools/perf/Documentation/perf-report.txt32
-rw-r--r--tools/perf/Documentation/perf-script.txt37
-rw-r--r--tools/perf/Documentation/perf-top.txt3
-rw-r--r--tools/perf/Documentation/perf-trace.txt2
-rw-r--r--tools/perf/Makefile2
-rw-r--r--tools/perf/Makefile.perf2
-rw-r--r--tools/perf/arch/powerpc/util/Build1
-rw-r--r--tools/perf/arch/powerpc/util/sym-handling.c82
-rw-r--r--tools/perf/bench/numa.c32
-rw-r--r--tools/perf/builtin-buildid-list.c9
-rw-r--r--tools/perf/builtin-inject.c174
-rw-r--r--tools/perf/builtin-kmem.c1015
-rw-r--r--tools/perf/builtin-probe.c167
-rw-r--r--tools/perf/builtin-record.c274
-rw-r--r--tools/perf/builtin-report.c11
-rw-r--r--tools/perf/builtin-script.c74
-rw-r--r--tools/perf/builtin-stat.c146
-rw-r--r--tools/perf/builtin-trace.c7
-rw-r--r--tools/perf/config/Makefile5
-rw-r--r--tools/perf/perf.h5
-rw-r--r--tools/perf/tests/code-reading.c2
-rw-r--r--tools/perf/tests/evsel-roundtrip-name.c4
-rw-r--r--tools/perf/tests/hists_cumulate.c2
-rw-r--r--tools/perf/tests/hists_filter.c4
-rw-r--r--tools/perf/tests/hists_link.c4
-rw-r--r--tools/perf/tests/hists_output.c2
-rw-r--r--tools/perf/tests/keep-tracking.c4
-rw-r--r--tools/perf/tests/make18
-rw-r--r--tools/perf/tests/parse-events.c2
-rw-r--r--tools/perf/tests/perf-time-to-tsc.c2
-rw-r--r--tools/perf/tests/pmu.c3
-rw-r--r--tools/perf/tests/switch-tracking.c8
-rw-r--r--tools/perf/ui/browsers/hists.c633
-rw-r--r--tools/perf/util/Build3
-rw-r--r--tools/perf/util/auxtrace.c1352
-rw-r--r--tools/perf/util/auxtrace.h643
-rw-r--r--tools/perf/util/callchain.h4
-rw-r--r--tools/perf/util/data-convert-bt.c410
-rw-r--r--tools/perf/util/dso.c2
-rw-r--r--tools/perf/util/dso.h3
-rw-r--r--tools/perf/util/event.c42
-rw-r--r--tools/perf/util/event.h70
-rw-r--r--tools/perf/util/evlist.c73
-rw-r--r--tools/perf/util/evlist.h6
-rw-r--r--tools/perf/util/evsel.c1
-rw-r--r--tools/perf/util/header.c37
-rw-r--r--tools/perf/util/header.h1
-rw-r--r--tools/perf/util/hist.c2
-rw-r--r--tools/perf/util/machine.c21
-rw-r--r--tools/perf/util/machine.h4
-rw-r--r--tools/perf/util/map.c5
-rw-r--r--tools/perf/util/map.h3
-rw-r--r--tools/perf/util/parse-events.c194
-rw-r--r--tools/perf/util/parse-events.h36
-rw-r--r--tools/perf/util/parse-events.l41
-rw-r--r--tools/perf/util/parse-events.y48
-rw-r--r--tools/perf/util/parse-options.h4
-rw-r--r--tools/perf/util/pmu.c57
-rw-r--r--tools/perf/util/pmu.h6
-rw-r--r--tools/perf/util/probe-event.c159
-rw-r--r--tools/perf/util/probe-event.h7
-rw-r--r--tools/perf/util/pstack.c7
-rw-r--r--tools/perf/util/pstack.h1
-rw-r--r--tools/perf/util/record.c15
-rw-r--r--tools/perf/util/session.c184
-rw-r--r--tools/perf/util/session.h6
-rw-r--r--tools/perf/util/sort.h38
-rw-r--r--tools/perf/util/strfilter.c107
-rw-r--r--tools/perf/util/strfilter.h35
-rw-r--r--tools/perf/util/symbol-elf.c13
-rw-r--r--tools/perf/util/symbol.c25
-rw-r--r--tools/perf/util/symbol.h12
-rw-r--r--tools/perf/util/tool.h12
82 files changed, 5843 insertions, 812 deletions
diff --git a/Makefile b/Makefile
index 2da553fd7fc3..66a094216cad 100644
--- a/Makefile
+++ b/Makefile
@@ -215,7 +215,6 @@ VPATH		:= $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
 
 export srctree objtree VPATH
 
-
 # SUBARCH tells the usermode build what the underlying arch is.  That is set
 # first, and if a usermode build is happening, the "ARCH=um" on the command
 # line overrides the setting of ARCH below.  If a native build is happening,
@@ -1497,11 +1496,11 @@ image_name:
 # Clear a bunch of variables before executing the submake
 tools/: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/
 
 tools/%: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/ $*
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/ $*
 
 # Single targets
 # ---------------------------------------------------------------------------
diff --git a/tools/Makefile b/tools/Makefile
index 9a617adc6675..b35102721cbb 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,3 +1,8 @@
+# Some of the tools (perf) use same make variables
+# as in kernel build.
+export srctree=
+export objtree=
+
 include scripts/Makefile.include
 
 help:
@@ -47,11 +52,16 @@ cgroup firewire hv guest usb virtio vm net: FORCE
 liblockdep: FORCE
 	$(call descend,lib/lockdep)
 
-libapikfs: FORCE
+libapi: FORCE
 	$(call descend,lib/api)
 
-perf: libapikfs FORCE
-	$(call descend,$@)
+# The perf build does not follow the descend function setup,
+# invoking it via its own make rule.
+PERF_O   = $(if $(O),$(O)/tools/perf,)
+
+perf: FORCE
+	$(Q)mkdir -p $(PERF_O) .
+	$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir=
 
 selftests: FORCE
 	$(call descend,testing/$@)
@@ -97,10 +107,10 @@ cgroup_clean hv_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clea
 liblockdep_clean:
 	$(call descend,lib/lockdep,clean)
 
-libapikfs_clean:
+libapi_clean:
 	$(call descend,lib/api,clean)
 
-perf_clean: libapikfs_clean
+perf_clean:
 	$(call descend,$(@:_clean=),clean)
 
 selftests_clean:
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index e0917c0f5d9f..e29e4f81a40d 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -1387,7 +1387,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
 			do_warning_event(event, "%s: no type found", __func__);
 			goto fail;
 		}
-		field->name = last_token;
+		field->name = field->alias = last_token;
 
 		if (test_type(type, EVENT_OP))
 			goto fail;
@@ -1469,7 +1469,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
 				size_dynamic = type_size(field->name);
 				free_token(field->name);
 				strcat(field->type, brackets);
-				field->name = token;
+				field->name = field->alias = token;
 				type = read_token(&token);
 			} else {
 				char *new_type;
@@ -6444,6 +6444,8 @@ void pevent_ref(struct pevent *pevent)
 void pevent_free_format_field(struct format_field *field)
 {
 	free(field->type);
+	if (field->alias != field->name)
+		free(field->alias);
 	free(field->name);
 	free(field);
 }
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 86a5839fb048..063b1971eb35 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -191,6 +191,7 @@ struct format_field {
 	struct event_format	*event;
 	char			*type;
 	char			*name;
+	char			*alias;
 	int			offset;
 	int			size;
 	unsigned int		arraylen;
diff --git a/tools/perf/Documentation/callchain-overhead-calculation.txt b/tools/perf/Documentation/callchain-overhead-calculation.txt
new file mode 100644
index 000000000000..1a757927195e
--- /dev/null
+++ b/tools/perf/Documentation/callchain-overhead-calculation.txt
@@ -0,0 +1,108 @@
+Overhead calculation
+--------------------
+The overhead can be shown in two columns as 'Children' and 'Self' when
+perf collects callchains.  The 'self' overhead is simply calculated by
+adding all period values of the entry - usually a function (symbol).
+This is the value that perf shows traditionally and sum of all the
+'self' overhead values should be 100%.
+
+The 'children' overhead is calculated by adding all period values of
+the child functions so that it can show the total overhead of the
+higher level functions even if they don't directly execute much.
+'Children' here means functions that are called from another (parent)
+function.
+
+It might be confusing that the sum of all the 'children' overhead
+values exceeds 100% since each of them is already an accumulation of
+'self' overhead of its child functions.  But with this enabled, users
+can find which function has the most overhead even if samples are
+spread over the children.
+
+Consider the following example; there are three functions like below.
+
+-----------------------
+void foo(void) {
+    /* do something */
+}
+
+void bar(void) {
+    /* do something */
+    foo();
+}
+
+int main(void) {
+    bar();
+    return 0;
+}
+-----------------------
+
+In this case 'foo' is a child of 'bar', and 'bar' is an immediate
+child of 'main' so 'foo' also is a child of 'main'.  In other words,
+'main' is a parent of 'foo' and 'bar', and 'bar' is a parent of 'foo'.
+
+Suppose all samples are recorded in 'foo' and 'bar' only.  When it's
+recorded with callchains the output will show something like below
+in the usual (self-overhead-only) output of perf report:
+
+----------------------------------
+Overhead  Symbol
+........  .....................
+  60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+
+  40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+----------------------------------
+
+When the --children option is enabled, the 'self' overhead values of
+child functions (i.e. 'foo' and 'bar') are added to the parents to
+calculate the 'children' overhead.  In this case the report could be
+displayed as:
+
+-------------------------------------------
+Children      Self  Symbol
+........  ........  ....................
+ 100.00%     0.00%  __libc_start_main
+          |
+          --- __libc_start_main
+
+ 100.00%     0.00%  main
+          |
+          --- main
+              __libc_start_main
+
+ 100.00%    40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+
+  60.00%    60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+-------------------------------------------
+
+In the above output, the 'self' overhead of 'foo' (60%) was added to the
+'children' overhead of 'bar', 'main' and '\_\_libc_start_main'.
+Likewise, the 'self' overhead of 'bar' (40%) was added to the
+'children' overhead of 'main' and '\_\_libc_start_main'.
+
+So '\_\_libc_start_main' and 'main' are shown first since they have the
+same (100%) 'children' overhead (even though they have zero 'self'
+overhead) and they are the parents of 'foo' and 'bar'.
+
+Since v3.16 the 'children' overhead is shown by default and the output
+is sorted by its values. The 'children' overhead is disabled by
+specifying --no-children option on the command line or by adding
+'report.children = false' or 'top.children = false' in the perf config
+file.
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index dc7442cf3d7f..b876ae312699 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -44,6 +44,33 @@ OPTIONS
 --kallsyms=<file>::
 	kallsyms pathname
 
+--itrace::
+	Decode Instruction Tracing data, replacing it with synthesized events.
+	Options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index 23219c65c16f..ff0f433b3fce 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -37,7 +37,11 @@ OPTIONS
 
 -s <key[,key2...]>::
 --sort=<key[,key2...]>::
-	Sort the output (default: frag,hit,bytes)
+	Sort the output (default: 'frag,hit,bytes' for slab and 'bytes,hit'
+	for page).  Available sort keys are 'ptr, callsite, bytes, hit,
+	pingpong, frag' for slab and 'page, callsite, bytes, hit, order,
+	migtype, gfp' for page.  This option should be preceded by one of the
+	mode selection options - i.e. --slab, --page, --alloc and/or --caller.
 
 -l <num>::
 --line=<num>::
@@ -52,6 +56,11 @@ OPTIONS
 --page::
 	Analyze page allocator events
 
+--live::
+	Show live page stats.  By default 'perf kmem' shows total allocation
+	stats, but this option shows live (currently allocated) pages
+	instead.  (This option works with the --page option only.)
+
 SEE ALSO
 --------
 linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 239609c09f83..a272f2e9a1cf 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -14,11 +14,13 @@ or
 or
 'perf probe' [options] --del='[GROUP:]EVENT' [...]
 or
-'perf probe' --list
+'perf probe' --list[=[GROUP:]EVENT]
 or
 'perf probe' [options] --line='LINE'
 or
 'perf probe' [options] --vars='PROBEPOINT'
+or
+'perf probe' [options] --funcs
 
 DESCRIPTION
 -----------
@@ -64,8 +66,8 @@ OPTIONS
 	classes(e.g. [a-z], [!A-Z]).
 
 -l::
---list::
-	List up current probe events.
+--list[=[GROUP:]EVENT]::
+	List up current probe events. This can also accept filtering patterns of event names.
 
 -L::
 --line=::
@@ -82,9 +84,10 @@ OPTIONS
 	variables.
 
 -F::
---funcs::
+--funcs[=FILTER]::
 	Show available functions in given module or kernel. With -x/--exec,
 	can also list functions in a user space executable / shared library.
+	This also can accept a FILTER rule argument.
 
 --filter=FILTER::
 	(Only for --vars and --funcs) Set filter. FILTER is a combination of glob
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 4847a793de65..57dd57bcef95 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -108,6 +108,8 @@ OPTIONS
 	Number of mmap data pages (must be a power of two) or size
 	specification with appended unit character - B/K/M/G. The
 	size is rounded up to have nearest pages power of two value.
+	Also, by adding a comma, the number of mmap pages for AUX
+	area tracing can be specified.
 
 --group::
 	Put all events in a single event group.  This precedes the --event
@@ -257,6 +259,13 @@ records. See clock_gettime(). In particular CLOCK_MONOTONIC and
 CLOCK_MONOTONIC_RAW are supported, some events might also allow
 CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI.
 
+-S::
+--snapshot::
+Select AUX area tracing Snapshot Mode. This option is valid only with an
+AUX area tracing event. Optionally the number of bytes to capture per
+snapshot can be specified. In Snapshot Mode, trace data is captured only when
+signal SIGUSR2 is received.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 4879cf638824..27190ed06f9c 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -193,6 +193,7 @@ OPTIONS
 	Accumulate callchain of children to parent entry so that then can
 	show up in the output.  The output will have a new "Children" column
 	and will be sorted on the data.  It requires callchains are recorded.
+	See the `overhead calculation' section for more details.
 
 --max-stack::
 	Set the stack depth limit when parsing the callchain, anything
@@ -323,6 +324,37 @@ OPTIONS
 --header-only::
 	Show only perf.data header (forces --stdio).
 
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
+	To disable decoding entirely, use --no-itrace.
+
+
+include::callchain-overhead-calculation.txt[]
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 79445750fcb3..c82df572fac2 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -115,7 +115,8 @@ OPTIONS
 -f::
 --fields::
         Comma separated list of fields to print. Options are:
-        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period.
+        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
+	srcline, period, flags.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -165,6 +166,12 @@ OPTIONS
 
 	At this point usage is displayed, and perf-script exits.
 
+	The flags field is synthesized and may have a value when decoding
+	Instruction Trace data. The flags are "bcrosyiABEx", which stand for
+	branch, call, return, conditional, system, asynchronous, interrupt,
+	transaction abort, trace begin, trace end, and in transaction,
+	respectively.
+
 	Finally, a user may not set fields to none for all event types.
 	i.e., -f "" is not allowed.
 
@@ -221,6 +228,34 @@ OPTIONS
 --header-only
 	Show only perf.data header.
 
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
+	To disable decoding entirely, use --no-itrace.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 3265b1070518..9e5b07eb7d35 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -168,7 +168,7 @@ Default is to monitor all CPUS.
 	Accumulate callchain of children to parent entry so that then can
 	show up in the output.  The output will have a new "Children" column
 	and will be sorted on the data.  It requires -g/--call-graph option
-	enabled.
+	enabled.  See the `overhead calculation' section for more details.
 
 --max-stack::
 	Set the stack depth limit when parsing the callchain, anything
@@ -234,6 +234,7 @@ INTERACTIVE PROMPTING KEYS
 
 Pressing any unmapped key displays a menu, and prompts for input.
 
+include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index ba03fd5d1a54..1db9c8b79880 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -35,7 +35,7 @@ OPTIONS
 
 -e::
 --expr::
-	List of events to show, currently only syscall names.
+	List of syscalls to show, currently only syscall names.
 	Prefixing with ! shows all syscalls but the ones specified.  You may
 	need to escape it.
 
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index c699dc35eef9..d31a7bbd7cee 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -24,7 +24,7 @@ unexport MAKEFLAGS
 # (To override it, run 'make JOBS=1' and similar.)
 #
 ifeq ($(JOBS),)
-  JOBS := $(shell egrep -c '^processor|^CPU' /proc/cpuinfo 2>/dev/null)
+  JOBS := $(shell (getconf _NPROCESSORS_ONLN || egrep -c '^processor|^CPU[0-9]' /proc/cpuinfo) 2>/dev/null)
   ifeq ($(JOBS),0)
     JOBS := 1
   endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index c43a20517591..03409cc02117 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -73,6 +73,8 @@ include config/utilities.mak
 # for CTF data format.
 #
 # Define NO_LZMA if you do not want to support compressed (xz) kernel modules
+#
+# Define NO_AUXTRACE if you do not want AUX area tracing support
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(shell pwd)))
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 0af6e9b3f728..7b8b0d1a1b62 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,4 +1,5 @@
 libperf-y += header.o
+libperf-y += sym-handling.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
new file mode 100644
index 000000000000..bbc1a50768dd
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -0,0 +1,82 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include "debug.h"
+#include "symbol.h"
+#include "map.h"
+#include "probe-event.h"
+
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_EXEC ||
+	       ehdr.e_type == ET_REL ||
+	       ehdr.e_type == ET_DYN;
+}
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+void arch__elf_sym_adjust(GElf_Sym *sym)
+{
+	sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+}
+#endif
+#endif
+
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+int arch__choose_best_symbol(struct symbol *syma,
+			     struct symbol *symb __maybe_unused)
+{
+	char *sym = syma->name;
+
+	/* Skip over any initial dot */
+	if (*sym == '.')
+		sym++;
+
+	/* Avoid "SyS" kernel syscall aliases */
+	if (strlen(sym) >= 3 && !strncmp(sym, "SyS", 3))
+		return SYMBOL_B;
+	if (strlen(sym) >= 10 && !strncmp(sym, "compat_SyS", 10))
+		return SYMBOL_B;
+
+	return SYMBOL_A;
+}
+
+/* Allow matching against dot variants */
+int arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+	/* Skip over initial dot */
+	if (*namea == '.')
+		namea++;
+	if (*nameb == '.')
+		nameb++;
+
+	return strcmp(namea, nameb);
+}
+#endif
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+bool arch__prefers_symtab(void)
+{
+	return true;
+}
+
+#define PPC64LE_LEP_OFFSET	8
+
+void arch__fix_tev_from_maps(struct perf_probe_event *pev,
+			     struct probe_trace_event *tev, struct map *map)
+{
+	/*
+	 * ppc64 ABIv2 local entry point is currently always 2 instructions
+	 * (8 bytes) after the global entry point.
+	 */
+	if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) {
+		tev->point.address += PPC64LE_LEP_OFFSET;
+		tev->point.offset += PPC64LE_LEP_OFFSET;
+	}
+}
+#endif
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index ebfa163b80b5..0b704c5f6d90 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -23,6 +23,7 @@
 #include <pthread.h>
 #include <sys/mman.h>
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/wait.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
@@ -51,6 +52,9 @@ struct thread_data {
 	unsigned int		loops_done;
 	u64			val;
 	u64			runtime_ns;
+	u64			system_time_ns;
+	u64			user_time_ns;
+	double			speed_gbs;
 	pthread_mutex_t		*process_lock;
 };
 
@@ -1034,6 +1038,7 @@ static void *worker_thread(void *__tdata)
 	u64 bytes_done;
 	long work_done;
 	u32 l;
+	struct rusage rusage;
 
 	bind_to_cpumask(td->bind_cpumask);
 	bind_to_memnode(td->bind_node);
@@ -1186,6 +1191,13 @@ static void *worker_thread(void *__tdata)
 	timersub(&stop, &start0, &diff);
 	td->runtime_ns = diff.tv_sec * 1000000000ULL;
 	td->runtime_ns += diff.tv_usec * 1000ULL;
+	td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+
+	getrusage(RUSAGE_THREAD, &rusage);
+	td->system_time_ns = rusage.ru_stime.tv_sec * 1000000000ULL;
+	td->system_time_ns += rusage.ru_stime.tv_usec * 1000ULL;
+	td->user_time_ns = rusage.ru_utime.tv_sec * 1000000000ULL;
+	td->user_time_ns += rusage.ru_utime.tv_usec * 1000ULL;
 
 	free_data(thread_data, g->p.bytes_thread);
 
@@ -1412,7 +1424,7 @@ static int __bench_numa(const char *name)
 	double runtime_sec_min;
 	int wait_stat;
 	double bytes;
-	int i, t;
+	int i, t, p;
 
 	if (init())
 		return -1;
@@ -1548,6 +1560,24 @@ static int __bench_numa(const char *name)
 	print_res(name, bytes / runtime_sec_max / 1e9,
 		"GB/sec,", "total-speed",	"GB/sec total speed");
 
+	if (g->p.show_details >= 2) {
+		char tname[32];
+		struct thread_data *td;
+		for (p = 0; p < g->p.nr_proc; p++) {
+			for (t = 0; t < g->p.nr_threads; t++) {
+				memset(tname, 0, 32);
+				td = g->threads + p*g->p.nr_threads + t;
+				snprintf(tname, 32, "process%d:thread%d", p, t);
+				print_res(tname, td->speed_gbs,
+					"GB/sec",	"thread-speed", "GB/sec/thread speed");
+				print_res(tname, td->system_time_ns / 1e9,
+					"secs",	"thread-system-time", "system CPU time/thread");
+				print_res(tname, td->user_time_ns / 1e9,
+					"secs",	"thread-user-time", "user CPU time/thread");
+			}
+		}
+	}
+
 	free(pids);
 
 	deinit();
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index feb420f74c2d..9fe93c8d4fcf 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -69,6 +69,15 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
 	session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops);
 	if (session == NULL)
 		return -1;
+
+	/*
+	 * We take all buildids when the file contains AUX area tracing data:
+	 * the trace is not decoded here because that would take too long.
+	 */
+	if (!perf_data_file__is_pipe(&file) &&
+	    perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+		with_hits = false;
+
 	/*
 	 * in pipe-mode, the only way to get the buildids is to parse
 	 * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 40a33d7334cc..d6a47e854b2b 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,6 +16,7 @@
 #include "util/debug.h"
 #include "util/build-id.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
 
 #include "util/parse-options.h"
 
@@ -26,10 +27,12 @@ struct perf_inject {
 	struct perf_session	*session;
 	bool			build_ids;
 	bool			sched_stat;
+	bool			have_auxtrace;
 	const char		*input_name;
 	struct perf_data_file	output;
 	u64			bytes_written;
 	struct list_head	samples;
+	struct itrace_synth_opts itrace_synth_opts;
 };
 
 struct event_entry {
@@ -38,14 +41,11 @@ struct event_entry {
 	union perf_event event[0];
 };
 
-static int perf_event__repipe_synth(struct perf_tool *tool,
-				    union perf_event *event)
+static int output_bytes(struct perf_inject *inject, void *buf, size_t sz)
 {
-	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
 	ssize_t size;
 
-	size = perf_data_file__write(&inject->output, event,
-				     event->header.size);
+	size = perf_data_file__write(&inject->output, buf, sz);
 	if (size < 0)
 		return -errno;
 
@@ -53,6 +53,15 @@ static int perf_event__repipe_synth(struct perf_tool *tool,
 	return 0;
 }
 
+static int perf_event__repipe_synth(struct perf_tool *tool,
+				    union perf_event *event)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject,
+						  tool);
+
+	return output_bytes(inject, event, event->header.size);
+}
+
 static int perf_event__repipe_oe_synth(struct perf_tool *tool,
 				       union perf_event *event,
 				       struct ordered_events *oe __maybe_unused)
@@ -86,6 +95,79 @@ static int perf_event__repipe_attr(struct perf_tool *tool,
 	return perf_event__repipe_synth(tool, event);
 }
 
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+static int copy_bytes(struct perf_inject *inject, int fd, off_t size)
+{
+	char buf[4096];
+	ssize_t ssz;
+	int ret;
+
+	while (size > 0) {
+		ssz = read(fd, buf, min(size, (off_t)sizeof(buf)));
+		if (ssz < 0)
+			return -errno;
+		ret = output_bytes(inject, buf, ssz);
+		if (ret)
+			return ret;
+		size -= ssz;
+	}
+
+	return 0;
+}
+
+static s64 perf_event__repipe_auxtrace(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session
+				       __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject,
+						  tool);
+	int ret;
+
+	inject->have_auxtrace = true;
+
+	if (!inject->output.is_pipe) {
+		off_t offset;
+
+		offset = lseek(inject->output.fd, 0, SEEK_CUR);
+		if (offset == -1)
+			return -errno;
+		ret = auxtrace_index__auxtrace_event(&session->auxtrace_index,
+						     event, offset);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (perf_data_file__is_pipe(session->file) || !session->one_mmap) {
+		ret = output_bytes(inject, event, event->header.size);
+		if (ret < 0)
+			return ret;
+		ret = copy_bytes(inject, perf_data_file__fd(session->file),
+				 event->auxtrace.size);
+	} else {
+		ret = output_bytes(inject, event,
+				   event->header.size + event->auxtrace.size);
+	}
+	if (ret < 0)
+		return ret;
+
+	return event->auxtrace.size;
+}
+
+#else
+
+static s64
+perf_event__repipe_auxtrace(struct perf_tool *tool __maybe_unused,
+			    union perf_event *event __maybe_unused,
+			    struct perf_session *session __maybe_unused)
+{
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+#endif
+
 static int perf_event__repipe(struct perf_tool *tool,
 			      union perf_event *event,
 			      struct perf_sample *sample __maybe_unused,
@@ -155,6 +237,32 @@ static int perf_event__repipe_fork(struct perf_tool *tool,
 	return err;
 }
 
+static int perf_event__repipe_comm(struct perf_tool *tool,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct machine *machine)
+{
+	int status = perf_event__process_comm(tool, event, sample, machine);
+
+	/* Repiped whatever 'status' is; the repipe result is not checked. */
+	perf_event__repipe(tool, event, sample, machine);
+
+	return status;
+}
+
+static int perf_event__repipe_exit(struct perf_tool *tool,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct machine *machine)
+{
+	int status = perf_event__process_exit(tool, event, sample, machine);
+
+	/* Repiped whatever 'status' is; the repipe result is not checked. */
+	perf_event__repipe(tool, event, sample, machine);
+
+	return status;
+}
+
 static int perf_event__repipe_tracing_data(struct perf_tool *tool,
 					   union perf_event *event,
 					   struct perf_session *session)
@@ -167,6 +275,18 @@ static int perf_event__repipe_tracing_data(struct perf_tool *tool,
 	return err;
 }
 
+/*
+ * Repipe first, then return perf_event__process_id_index()'s result.
+ */
+static int perf_event__repipe_id_index(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session)
+{
+	perf_event__repipe_synth(tool, event);
+
+	return perf_event__process_id_index(tool, event, session);
+}
+
 static int dso__read_build_id(struct dso *dso)
 {
 	if (dso->has_build_id)
@@ -351,16 +471,20 @@ static int __cmd_inject(struct perf_inject *inject)
 	struct perf_session *session = inject->session;
 	struct perf_data_file *file_out = &inject->output;
 	int fd = perf_data_file__fd(file_out);
+	u64 output_data_offset;
 
 	signal(SIGINT, sig_handler);
 
-	if (inject->build_ids || inject->sched_stat) {
+	if (inject->build_ids || inject->sched_stat ||
+	    inject->itrace_synth_opts.set) {
 		inject->tool.mmap	  = perf_event__repipe_mmap;
 		inject->tool.mmap2	  = perf_event__repipe_mmap2;
 		inject->tool.fork	  = perf_event__repipe_fork;
 		inject->tool.tracing_data = perf_event__repipe_tracing_data;
 	}
 
+	output_data_offset = session->header.data_offset;
+
 	if (inject->build_ids) {
 		inject->tool.sample = perf_event__inject_buildid;
 	} else if (inject->sched_stat) {
@@ -379,17 +503,43 @@ static int __cmd_inject(struct perf_inject *inject)
 			else if (!strncmp(name, "sched:sched_stat_", 17))
 				evsel->handler = perf_inject__sched_stat;
 		}
+	} else if (inject->itrace_synth_opts.set) {
+		session->itrace_synth_opts = &inject->itrace_synth_opts;
+		inject->itrace_synth_opts.inject = true;
+		inject->tool.comm	    = perf_event__repipe_comm;
+		inject->tool.exit	    = perf_event__repipe_exit;
+		inject->tool.id_index	    = perf_event__repipe_id_index;
+		inject->tool.auxtrace_info  = perf_event__process_auxtrace_info;
+		inject->tool.auxtrace	    = perf_event__process_auxtrace;
+		inject->tool.ordered_events = true;
+		inject->tool.ordering_requires_timestamps = true;
+		/* Allow space in the header for new attributes */
+		output_data_offset = 4096;
 	}
 
+	if (!inject->itrace_synth_opts.set)
+		auxtrace_index__free(&session->auxtrace_index);
+
 	if (!file_out->is_pipe)
-		lseek(fd, session->header.data_offset, SEEK_SET);
+		lseek(fd, output_data_offset, SEEK_SET);
 
 	ret = perf_session__process_events(session);
 
 	if (!file_out->is_pipe) {
-		if (inject->build_ids)
+		if (inject->build_ids) {
 			perf_header__set_feat(&session->header,
 					      HEADER_BUILD_ID);
+			if (inject->have_auxtrace)
+				dsos__hit_all(session);
+		}
+		/*
+		 * The AUX areas have been removed and replaced with
+		 * synthesized hardware events, so clear the feature flag.
+		 */
+		if (inject->itrace_synth_opts.set)
+			perf_header__clear_feat(&session->header,
+						HEADER_AUXTRACE);
+		session->header.data_offset = output_data_offset;
 		session->header.data_size = inject->bytes_written;
 		perf_session__write_header(session, session->evlist, fd, true);
 	}
@@ -408,11 +558,16 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 			.fork		= perf_event__repipe,
 			.exit		= perf_event__repipe,
 			.lost		= perf_event__repipe,
+			.aux		= perf_event__repipe,
+			.itrace_start	= perf_event__repipe,
 			.read		= perf_event__repipe_sample,
 			.throttle	= perf_event__repipe,
 			.unthrottle	= perf_event__repipe,
 			.attr		= perf_event__repipe_attr,
 			.tracing_data	= perf_event__repipe_op2_synth,
+			.auxtrace_info	= perf_event__repipe_op2_synth,
+			.auxtrace	= perf_event__repipe_auxtrace,
+			.auxtrace_error	= perf_event__repipe_op2_synth,
 			.finished_round	= perf_event__repipe_oe_synth,
 			.build_id	= perf_event__repipe_op2_synth,
 			.id_index	= perf_event__repipe_op2_synth,
@@ -444,6 +599,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 		OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
 			   "kallsyms pathname"),
 		OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+		OPT_CALLBACK_OPTARG(0, "itrace", &inject.itrace_synth_opts,
+				    NULL, "opts", "Instruction Tracing options",
+				    itrace_parse_synth_opts),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 63ea01349b6e..e628bf1a0c24 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -10,6 +10,7 @@
 #include "util/header.h"
 #include "util/session.h"
 #include "util/tool.h"
+#include "util/callchain.h"
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
@@ -21,14 +22,19 @@
 #include <linux/rbtree.h>
 #include <linux/string.h>
 #include <locale.h>
+#include <regex.h>
 
 static int	kmem_slab;
 static int	kmem_page;
 
 static long	kmem_page_size;
+static enum {
+	KMEM_SLAB,
+	KMEM_PAGE,
+} kmem_default = KMEM_SLAB;  /* for backward compatibility */
 
 struct alloc_stat;
-typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+typedef int (*sort_fn_t)(void *, void *);
 
 static int			alloc_flag;
 static int			caller_flag;
@@ -179,8 +185,8 @@ static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
 	return ret;
 }
 
-static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
-static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
+static int ptr_cmp(void *, void *);
+static int slab_callsite_cmp(void *, void *);
 
 static struct alloc_stat *search_alloc_stat(unsigned long ptr,
 					    unsigned long call_site,
@@ -221,7 +227,8 @@ static int perf_evsel__process_free_event(struct perf_evsel *evsel,
 		s_alloc->pingpong++;
 
 		s_caller = search_alloc_stat(0, s_alloc->call_site,
-					     &root_caller_stat, callsite_cmp);
+					     &root_caller_stat,
+					     slab_callsite_cmp);
 		if (!s_caller)
 			return -1;
 		s_caller->pingpong++;
@@ -241,6 +248,8 @@ static unsigned long nr_page_fails;
 static unsigned long nr_page_nomatch;
 
 static bool use_pfn;
+static bool live_page;
+static struct perf_session *kmem_session;
 
 #define MAX_MIGRATE_TYPES  6
 #define MAX_PAGE_ORDER     11
@@ -250,6 +259,7 @@ static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];
 struct page_stat {
 	struct rb_node 	node;
 	u64 		page;
+	u64 		callsite;
 	int 		order;
 	unsigned 	gfp_flags;
 	unsigned 	migrate_type;
@@ -259,13 +269,158 @@ struct page_stat {
 	int 		nr_free;
 };
 
-static struct rb_root page_tree;
+static struct rb_root page_live_tree;
 static struct rb_root page_alloc_tree;
 static struct rb_root page_alloc_sorted;
+static struct rb_root page_caller_tree;
+static struct rb_root page_caller_sorted;
 
-static struct page_stat *search_page(unsigned long page, bool create)
+struct alloc_func {
+	u64 start;	/* copied from sym->start; the list is sorted on this */
+	u64 end;	/* copied from sym->end; callcmp() treats it as exclusive */
+	char *name;	/* points at sym->name, not a private copy */
+};
+
+static int nr_alloc_funcs;
+static struct alloc_func *alloc_func_list;
+
+/*
+ * qsort() comparator ordering alloc_func entries by ascending 'start'.
+ * Entries with equal 'start' compare as "less", never as equal.
+ */
+static int funcmp(const void *a, const void *b)
+{
+	const struct alloc_func *lhs = a, *rhs = b;
+
+	return lhs->start > rhs->start ? 1 : -1;
+}
+
+/*
+ * bsearch() comparator: 'a' is the search key, 'b' a list entry.  The key
+ * matches when key->start >= entry->start and key->end < entry->end.
+ */
+static int callcmp(const void *a, const void *b)
+{
+	const struct alloc_func *key = a, *ent = b;
+
+	if (key->start >= ent->start && key->end < ent->end)
+		return 0;
+
+	return key->start > ent->start ? 1 : -1;
+}
+
+/* Collect kernel symbols matching 'pattern' into the sorted alloc_func_list. */
+static int build_alloc_func_list(void)
+{
+	int ret;
+	struct map *kernel_map;
+	struct symbol *sym;
+	struct rb_node *node;
+	struct alloc_func *func;
+	struct machine *machine = &kmem_session->machines.host;
+	regex_t alloc_func_regex;
+	const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?";
+
+	ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED);
+	if (ret) {
+		char err[BUFSIZ];
+		regerror(ret, &alloc_func_regex, err, sizeof(err));
+		pr_err("Invalid regex: %s\n%s", pattern, err);
+		return -EINVAL;
+	}
+
+	kernel_map = machine->vmlinux_maps[MAP__FUNCTION];
+	ret = -ENOENT;
+	if (map__load(kernel_map, NULL) < 0) {
+		pr_err("cannot load kernel map\n");
+		goto out;
+	}
+
+	ret = -ENOMEM;	/* from here on only realloc() can fail */
+	map__for_each_symbol(kernel_map, sym, node) {
+		if (regexec(&alloc_func_regex, sym->name, 0, NULL, 0))
+			continue;
+		func = realloc(alloc_func_list,
+			       (nr_alloc_funcs + 1) * sizeof(*func));
+		if (func == NULL)
+			goto out;
+		pr_debug("alloc func: %s\n", sym->name);
+		func[nr_alloc_funcs].start = sym->start;
+		func[nr_alloc_funcs].end   = sym->end;
+		func[nr_alloc_funcs].name  = sym->name;
+		alloc_func_list = func;
+		nr_alloc_funcs++;
+	}
+
+	qsort(alloc_func_list, nr_alloc_funcs, sizeof(*func), funcmp);
+	ret = 0;
+out:	/* every exit after a successful regcomp() frees the compiled regex */
+	regfree(&alloc_func_regex);
+	return ret;
+}
+
+/*
+ * Find first non-memory allocation function from callchain.
+ * The allocation functions are in the 'alloc_func_list'.
+ */
+static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample)
+{
+	struct addr_location al;	/* NOTE(review): only .thread is set below */
+	struct machine *machine = &kmem_session->machines.host;
+	struct callchain_cursor_node *node;
+
+	if (alloc_func_list == NULL) {	/* list is built on demand */
+		if (build_alloc_func_list() < 0)
+			goto out;
+	}
+
+	al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+	sample__resolve_callchain(sample, NULL, evsel, &al, 16);
+
+	callchain_cursor_commit(&callchain_cursor);
+	while (true) {
+		struct alloc_func key, *caller;
+		u64 addr;
+
+		node = callchain_cursor_current(&callchain_cursor);
+		if (node == NULL)
+			break;
+
+		key.start = key.end = node->ip;	/* single-address search key */
+		caller = bsearch(&key, alloc_func_list, nr_alloc_funcs,
+				 sizeof(key), callcmp);
+		if (!caller) {
+			/* found: this frame lies outside every listed function */
+			if (node->map)
+				addr = map__unmap_ip(node->map, node->ip);
+			else
+				addr = node->ip;
+
+			return addr;
+		} else
+			pr_debug3("skipping alloc function: %s\n", caller->name);
+
+		callchain_cursor_advance(&callchain_cursor);
+	}
+
+out:	/* no usable frame (or no function list): fall back to sample->ip */
+	pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
+	return sample->ip;
+}
+
+struct sort_dimension {
+	const char		name[20];	/* sort key name, e.g. "page" */
+	sort_fn_t		cmp;		/* comparison callback for this key */
+	struct list_head	list;		/* link in a list of sort keys */
+};
+
+static LIST_HEAD(page_alloc_sort_input);
+static LIST_HEAD(page_caller_sort_input);
+
+static struct page_stat *
+__page_stat__findnew_page(struct page_stat *pstat, bool create)
 {
-	struct rb_node **node = &page_tree.rb_node;
+	struct rb_node **node = &page_live_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct page_stat *data;
 
@@ -275,7 +430,7 @@ static struct page_stat *search_page(unsigned long page, bool create)
 		parent = *node;
 		data = rb_entry(*node, struct page_stat, node);
 
-		cmp = data->page - page;
+		cmp = data->page - pstat->page;
 		if (cmp < 0)
 			node = &parent->rb_left;
 		else if (cmp > 0)
@@ -289,49 +444,48 @@ static struct page_stat *search_page(unsigned long page, bool create)
 
 	data = zalloc(sizeof(*data));
 	if (data != NULL) {
-		data->page = page;
+		data->page = pstat->page;
+		data->order = pstat->order;
+		data->gfp_flags = pstat->gfp_flags;
+		data->migrate_type = pstat->migrate_type;
 
 		rb_link_node(&data->node, parent, node);
-		rb_insert_color(&data->node, &page_tree);
+		rb_insert_color(&data->node, &page_live_tree);
 	}
 
 	return data;
 }
 
-static int page_stat_cmp(struct page_stat *a, struct page_stat *b)
+static struct page_stat *page_stat__find_page(struct page_stat *pstat)
 {
-	if (a->page > b->page)
-		return -1;
-	if (a->page < b->page)
-		return 1;
-	if (a->order > b->order)
-		return -1;
-	if (a->order < b->order)
-		return 1;
-	if (a->migrate_type > b->migrate_type)
-		return -1;
-	if (a->migrate_type < b->migrate_type)
-		return 1;
-	if (a->gfp_flags > b->gfp_flags)
-		return -1;
-	if (a->gfp_flags < b->gfp_flags)
-		return 1;
-	return 0;
+	return __page_stat__findnew_page(pstat, false);
+}
+
+static struct page_stat *page_stat__findnew_page(struct page_stat *pstat)
+{
+	return __page_stat__findnew_page(pstat, true);
 }
 
-static struct page_stat *search_page_alloc_stat(struct page_stat *stat, bool create)
+static struct page_stat *
+__page_stat__findnew_alloc(struct page_stat *pstat, bool create)
 {
 	struct rb_node **node = &page_alloc_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct page_stat *data;
+	struct sort_dimension *sort;
 
 	while (*node) {
-		s64 cmp;
+		int cmp = 0;
 
 		parent = *node;
 		data = rb_entry(*node, struct page_stat, node);
 
-		cmp = page_stat_cmp(data, stat);
+		list_for_each_entry(sort, &page_alloc_sort_input, list) {
+			cmp = sort->cmp(pstat, data);
+			if (cmp)
+				break;
+		}
+
 		if (cmp < 0)
 			node = &parent->rb_left;
 		else if (cmp > 0)
@@ -345,10 +499,10 @@ static struct page_stat *search_page_alloc_stat(struct page_stat *stat, bool cre
 
 	data = zalloc(sizeof(*data));
 	if (data != NULL) {
-		data->page = stat->page;
-		data->order = stat->order;
-		data->gfp_flags = stat->gfp_flags;
-		data->migrate_type = stat->migrate_type;
+		data->page = pstat->page;
+		data->order = pstat->order;
+		data->gfp_flags = pstat->gfp_flags;
+		data->migrate_type = pstat->migrate_type;
 
 		rb_link_node(&data->node, parent, node);
 		rb_insert_color(&data->node, &page_alloc_tree);
@@ -357,6 +511,71 @@ static struct page_stat *search_page_alloc_stat(struct page_stat *stat, bool cre
 	return data;
 }
 
+static struct page_stat *page_stat__find_alloc(struct page_stat *pstat)
+{
+	return __page_stat__findnew_alloc(pstat, false); /* create == false */
+}
+
+static struct page_stat *page_stat__findnew_alloc(struct page_stat *pstat)
+{
+	return __page_stat__findnew_alloc(pstat, true); /* create == true */
+}
+
+static struct page_stat *
+__page_stat__findnew_caller(struct page_stat *pstat, bool create)
+{
+	struct rb_node **link = &page_caller_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct page_stat *entry;
+	struct sort_dimension *dim;
+
+	/* Descend the tree; the first sort key that differs picks the side. */
+	while (*link) {
+		int diff = 0;
+
+		parent = *link;
+		entry = rb_entry(parent, struct page_stat, node);
+
+		list_for_each_entry(dim, &page_caller_sort_input, list) {
+			diff = dim->cmp(pstat, entry);
+			if (diff)
+				break;
+		}
+
+		if (diff == 0)
+			return entry;
+		link = diff < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	if (!create)
+		return NULL;
+
+	entry = zalloc(sizeof(*entry));
+	if (entry == NULL)
+		return NULL;
+
+	/* New node: copy the key fields, then link it at the slot just found. */
+	entry->callsite = pstat->callsite;
+	entry->order = pstat->order;
+	entry->gfp_flags = pstat->gfp_flags;
+	entry->migrate_type = pstat->migrate_type;
+
+	rb_link_node(&entry->node, parent, link);
+	rb_insert_color(&entry->node, &page_caller_tree);
+
+	return entry;
+}
+
+static struct page_stat *page_stat__find_caller(struct page_stat *pstat)
+{
+	return __page_stat__findnew_caller(pstat, false); /* NULL if absent */
+}
+
+static struct page_stat *page_stat__findnew_caller(struct page_stat *pstat)
+{
+	return __page_stat__findnew_caller(pstat, true); /* insert if absent */
+}
+
 static bool valid_page(u64 pfn_or_page)
 {
 	if (use_pfn && pfn_or_page == -1UL)
@@ -366,6 +585,176 @@ static bool valid_page(u64 pfn_or_page)
 	return true;
 }
 
+struct gfp_flag {
+	unsigned int flags;	/* numeric GFP value; the key gfpcmp() compares */
+	char *compact_str;	/* result of compact_gfp_flags() */
+	char *human_readable;	/* strdup() of the text after "gfp_flags=" */
+};
+
+static struct gfp_flag *gfps;
+static int nr_gfps;
+
+/* qsort()/bsearch() comparator ordering gfp_flag entries by 'flags'. */
+static int gfpcmp(const void *a, const void *b)
+{
+	const struct gfp_flag *fa = a, *fb = b;
+	/* No subtraction: the unsigned difference may not fit in an int. */
+	return (fa->flags > fb->flags) - (fa->flags < fb->flags);
+}
+
+/* see include/trace/events/gfpflags.h */
+static const struct {
+	const char *original;
+	const char *compact;
+} gfp_compact_table[] = {
+	{ "GFP_TRANSHUGE",		"THP" },
+	{ "GFP_HIGHUSER_MOVABLE",	"HUM" },
+	{ "GFP_HIGHUSER",		"HU" },
+	{ "GFP_USER",			"U" },
+	{ "GFP_TEMPORARY",		"TMP" },
+	{ "GFP_KERNEL",			"K" },
+	{ "GFP_NOFS",			"NF" },
+	{ "GFP_ATOMIC",			"A" },
+	{ "GFP_NOIO",			"NI" },
+	{ "GFP_HIGH",			"H" },
+	{ "GFP_WAIT",			"W" },
+	{ "GFP_IO",			"I" },
+	{ "GFP_COLD",			"CO" },
+	{ "GFP_NOWARN",			"NWR" },
+	{ "GFP_REPEAT",			"R" },
+	{ "GFP_NOFAIL",			"NF" }, /* NOTE(review): same as GFP_NOFS */
+	{ "GFP_NORETRY",		"NR" },
+	{ "GFP_COMP",			"C" },
+	{ "GFP_ZERO",			"Z" },
+	{ "GFP_NOMEMALLOC",		"NMA" },
+	{ "GFP_MEMALLOC",		"MA" },
+	{ "GFP_HARDWALL",		"HW" },
+	{ "GFP_THISNODE",		"TN" },
+	{ "GFP_RECLAIMABLE",		"RC" },
+	{ "GFP_MOVABLE",		"M" },
+	{ "GFP_NOTRACK",		"NT" },
+	{ "GFP_NO_KSWAPD",		"NK" },
+	{ "GFP_OTHER_NODE",		"ON" },
+	{ "GFP_NOWAIT",			"NW" },
+};
+
+static size_t max_gfp_len;
+
+/* Compact a '|'-separated GFP string; NULL on ENOMEM or if no name is known. */
+static char *compact_gfp_flags(char *gfp_flags)
+{
+	char *orig_flags = strdup(gfp_flags);
+	char *new_flags = NULL;
+	char *str, *pos;
+	size_t len = 0;
+
+	if (orig_flags == NULL)
+		return NULL;
+
+	str = strtok_r(orig_flags, "|", &pos);
+	while (str) {
+		size_t i;
+		char *new;
+		const char *cpt;
+
+		for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
+			if (strcmp(gfp_compact_table[i].original, str))
+				continue;
+
+			cpt = gfp_compact_table[i].compact;
+			new = realloc(new_flags, len + strlen(cpt) + 2);
+			if (new == NULL) {
+				free(new_flags);
+				free(orig_flags);	/* strtok_r() scratch copy */
+				return NULL;
+			}
+			new_flags = new;
+
+			if (!len) {
+				strcpy(new_flags, cpt);
+			} else {
+				strcat(new_flags, "|");
+				strcat(new_flags, cpt);
+				len++;
+			}
+			len += strlen(cpt);
+		}
+
+		str = strtok_r(NULL, "|", &pos);
+	}
+
+	if (max_gfp_len < len)
+		max_gfp_len = len;
+
+	free(orig_flags);
+	return new_flags;
+}
+
+/*
+ * Return the compact GFP string stored in 'gfps' for 'gfp_flags', or NULL
+ * when no entry with that value is present.
+ */
+static char *compact_gfp_string(unsigned long gfp_flags)
+{
+	struct gfp_flag *hit;
+	struct gfp_flag key = { .flags = gfp_flags };
+
+	hit = bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+
+	return hit ? hit->compact_str : NULL;
+}
+
+/* Record the string forms of 'gfp_flags' in 'gfps'; returns 0 or -ENOMEM. */
+static int parse_gfp_flags(struct perf_evsel *evsel, struct perf_sample *sample,
+			   unsigned int gfp_flags)
+{
+	struct pevent_record record = {
+		.cpu = sample->cpu,
+		.data = sample->raw_data,
+		.size = sample->raw_size,
+	};
+	struct trace_seq seq;
+	char *str, *pos;
+	int ret = -ENOMEM;	/* the only error after trace_seq_init() */
+
+	if (nr_gfps) {
+		struct gfp_flag key = { .flags = gfp_flags };
+
+		if (bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp))
+			return 0;	/* already recorded */
+	}
+
+	trace_seq_init(&seq);
+	pevent_event_info(&seq, evsel->tp_format, &record);
+
+	str = strtok_r(seq.buffer, " ", &pos);
+	while (str) {
+		if (!strncmp(str, "gfp_flags=", 10)) {
+			struct gfp_flag *new;
+
+			new = realloc(gfps, (nr_gfps + 1) * sizeof(*gfps));
+			if (new == NULL)
+				goto out;
+			gfps = new;
+			new += nr_gfps++;
+			new->flags = gfp_flags;
+			new->human_readable = strdup(str + 10);
+			new->compact_str = compact_gfp_flags(str + 10);
+			if (!new->human_readable || !new->compact_str)
+				goto out;
+
+			qsort(gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+		}
+
+		str = strtok_r(NULL, " ", &pos);
+	}
+
+	ret = 0;
+out:	/* error paths come here too, so 'seq' is always destroyed */
+	trace_seq_destroy(&seq);
+	return ret;
+}
+
 static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 						struct perf_sample *sample)
 {
@@ -375,7 +764,8 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 	unsigned int migrate_type = perf_evsel__intval(evsel, sample,
 						       "migratetype");
 	u64 bytes = kmem_page_size << order;
-	struct page_stat *stat;
+	u64 callsite;
+	struct page_stat *pstat;
 	struct page_stat this = {
 		.order = order,
 		.gfp_flags = gfp_flags,
@@ -397,25 +787,41 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 		return 0;
 	}
 
+	if (parse_gfp_flags(evsel, sample, gfp_flags) < 0)
+		return -1;
+
+	callsite = find_callsite(evsel, sample);
+
 	/*
 	 * This is to find the current page (with correct gfp flags and
 	 * migrate type) at free event.
 	 */
-	stat = search_page(page, true);
-	if (stat == NULL)
+	this.page = page;
+	pstat = page_stat__findnew_page(&this);
+	if (pstat == NULL)
 		return -ENOMEM;
 
-	stat->order = order;
-	stat->gfp_flags = gfp_flags;
-	stat->migrate_type = migrate_type;
+	pstat->nr_alloc++;
+	pstat->alloc_bytes += bytes;
+	pstat->callsite = callsite;
 
-	this.page = page;
-	stat = search_page_alloc_stat(&this, true);
-	if (stat == NULL)
+	if (!live_page) {
+		pstat = page_stat__findnew_alloc(&this);
+		if (pstat == NULL)
+			return -ENOMEM;
+
+		pstat->nr_alloc++;
+		pstat->alloc_bytes += bytes;
+		pstat->callsite = callsite;
+	}
+
+	this.callsite = callsite;
+	pstat = page_stat__findnew_caller(&this);
+	if (pstat == NULL)
 		return -ENOMEM;
 
-	stat->nr_alloc++;
-	stat->alloc_bytes += bytes;
+	pstat->nr_alloc++;
+	pstat->alloc_bytes += bytes;
 
 	order_stats[order][migrate_type]++;
 
@@ -428,7 +834,7 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
 	u64 page;
 	unsigned int order = perf_evsel__intval(evsel, sample, "order");
 	u64 bytes = kmem_page_size << order;
-	struct page_stat *stat;
+	struct page_stat *pstat;
 	struct page_stat this = {
 		.order = order,
 	};
@@ -441,8 +847,9 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
 	nr_page_frees++;
 	total_page_free_bytes += bytes;
 
-	stat = search_page(page, false);
-	if (stat == NULL) {
+	this.page = page;
+	pstat = page_stat__find_page(&this);
+	if (pstat == NULL) {
 		pr_debug2("missing free at page %"PRIx64" (order: %d)\n",
 			  page, order);
 
@@ -452,19 +859,40 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
 		return 0;
 	}
 
-	this.page = page;
-	this.gfp_flags = stat->gfp_flags;
-	this.migrate_type = stat->migrate_type;
+	this.gfp_flags = pstat->gfp_flags;
+	this.migrate_type = pstat->migrate_type;
+	this.callsite = pstat->callsite;
 
-	rb_erase(&stat->node, &page_tree);
-	free(stat);
+	rb_erase(&pstat->node, &page_live_tree);
+	free(pstat);
 
-	stat = search_page_alloc_stat(&this, false);
-	if (stat == NULL)
+	if (live_page) {
+		order_stats[this.order][this.migrate_type]--;
+	} else {
+		pstat = page_stat__find_alloc(&this);
+		if (pstat == NULL)
+			return -ENOMEM;
+
+		pstat->nr_free++;
+		pstat->free_bytes += bytes;
+	}
+
+	pstat = page_stat__find_caller(&this);
+	if (pstat == NULL)
 		return -ENOENT;
 
-	stat->nr_free++;
-	stat->free_bytes += bytes;
+	pstat->nr_free++;
+	pstat->free_bytes += bytes;
+
+	if (live_page) {
+		pstat->nr_alloc--;
+		pstat->alloc_bytes -= bytes;
+
+		if (pstat->nr_alloc == 0) {
+			rb_erase(&pstat->node, &page_caller_tree);
+			free(pstat);
+		}
+	}
 
 	return 0;
 }
@@ -576,41 +1004,111 @@ static const char * const migrate_type_str[] = {
 	"UNKNOWN",
 };
 
-static void __print_page_result(struct rb_root *root,
-				struct perf_session *session __maybe_unused,
-				int n_lines)
+static void __print_page_alloc_result(struct perf_session *session, int n_lines)
 {
-	struct rb_node *next = rb_first(root);
+	struct rb_node *next = rb_first(&page_alloc_sorted);
+	struct machine *machine = &session->machines.host;
 	const char *format;
+	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
 
-	printf("\n%.80s\n", graph_dotted_line);
-	printf(" %-16s | Total alloc (KB) | Hits      | Order | Mig.type | GFP flags\n",
-	       use_pfn ? "PFN" : "Page");
-	printf("%.80s\n", graph_dotted_line);
+	printf("\n%.105s\n", graph_dotted_line);
+	printf(" %-16s | %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+	       use_pfn ? "PFN" : "Page", live_page ? "Live" : "Total",
+	       gfp_len, "GFP flags");
+	printf("%.105s\n", graph_dotted_line);
 
 	if (use_pfn)
-		format = " %16llu | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+		format = " %16llu | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
 	else
-		format = " %016llx | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+		format = " %016llx | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
 
 	while (next && n_lines--) {
 		struct page_stat *data;
+		struct symbol *sym;
+		struct map *map;
+		char buf[32];
+		char *caller = buf;
 
 		data = rb_entry(next, struct page_stat, node);
+		sym = machine__find_kernel_function(machine, data->callsite,
+						    &map, NULL);
+		if (sym && sym->name)
+			caller = sym->name;
+		else
+			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
 
 		printf(format, (unsigned long long)data->page,
 		       (unsigned long long)data->alloc_bytes / 1024,
 		       data->nr_alloc, data->order,
 		       migrate_type_str[data->migrate_type],
-		       (unsigned long)data->gfp_flags);
+		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
 
 		next = rb_next(next);
 	}
 
-	if (n_lines == -1)
-		printf(" ...              | ...              | ...       | ...   | ...      | ...     \n");
+	if (n_lines == -1) {
+		printf(" ...              | ...              | ...       | ...   | ...      | %-*s | ...\n",
+		       gfp_len, "...");
+	}
 
-	printf("%.80s\n", graph_dotted_line);
+	printf("%.105s\n", graph_dotted_line);
+}
+
+static void __print_page_caller_result(struct perf_session *session, int n_lines)
+{
+	struct machine *machine = &session->machines.host;
+	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
+	struct rb_node *nd;
+
+	printf("\n%.105s\n", graph_dotted_line);
+	printf(" %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+	       live_page ? "Live" : "Total", gfp_len, "GFP flags");
+	printf("%.105s\n", graph_dotted_line);
+
+	/* One row per tree node, stopping once n_lines rows have been printed. */
+	for (nd = rb_first(&page_caller_sorted); nd && n_lines--; nd = rb_next(nd)) {
+		struct page_stat *data = rb_entry(nd, struct page_stat, node);
+		struct symbol *sym;
+		struct map *map;
+		char buf[32];
+		char *caller = buf;
+
+		sym = machine__find_kernel_function(machine, data->callsite,
+						    &map, NULL);
+		/* Without a symbol the callsite is shown as a hex address. */
+		if (sym && sym->name)
+			caller = sym->name;
+		else
+			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
+
+		printf(" %'16llu | %'9d | %5d | %8s | %-*s | %s\n",
+		       (unsigned long long)data->alloc_bytes / 1024,
+		       data->nr_alloc, data->order,
+		       migrate_type_str[data->migrate_type],
+		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
+	}
+
+	/* n_lines reaches -1 when the loop stopped on the row limit. */
+	if (n_lines == -1) {
+		printf(" ...              | ...       | ...   | ...      | %-*s | ...\n",
+		       gfp_len, "...");
+	}
+
+	printf("%.105s\n", graph_dotted_line);
+}
+
+/* Print a '#'-prefixed legend line for every entry stored in 'gfps'. */
+static void print_gfp_flags(void)
+{
+	int idx = 0;
+
+	printf("#\n"
+	       "# GFP flags\n"
+	       "# ---------\n");
+
+	for (; idx < nr_gfps; idx++)
+		printf("# %08x: %*s: %s\n", gfps[idx].flags, (int) max_gfp_len,
+		       gfps[idx].compact_str, gfps[idx].human_readable);
+}
 
 static void print_slab_summary(void)
@@ -682,8 +1180,12 @@ static void print_slab_result(struct perf_session *session)
 
 static void print_page_result(struct perf_session *session)
 {
+	if (caller_flag || alloc_flag)
+		print_gfp_flags();
+	if (caller_flag)
+		__print_page_caller_result(session, caller_lines);
 	if (alloc_flag)
-		__print_page_result(&page_alloc_sorted, session, alloc_lines);
+		__print_page_alloc_result(session, alloc_lines);
 	print_page_summary();
 }
 
@@ -695,14 +1197,10 @@ static void print_result(struct perf_session *session)
 		print_page_result(session);
 }
 
-struct sort_dimension {
-	const char		name[20];
-	sort_fn_t		cmp;
-	struct list_head	list;
-};
-
-static LIST_HEAD(caller_sort);
-static LIST_HEAD(alloc_sort);
+static LIST_HEAD(slab_caller_sort);
+static LIST_HEAD(slab_alloc_sort);
+static LIST_HEAD(page_caller_sort);
+static LIST_HEAD(page_alloc_sort);
 
 static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
 			     struct list_head *sort_list)
@@ -751,10 +1249,12 @@ static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted
 	}
 }
 
-static void sort_page_insert(struct rb_root *root, struct page_stat *data)
+static void sort_page_insert(struct rb_root *root, struct page_stat *data,
+			     struct list_head *sort_list)
 {
 	struct rb_node **new = &root->rb_node;
 	struct rb_node *parent = NULL;
+	struct sort_dimension *sort;
 
 	while (*new) {
 		struct page_stat *this;
@@ -763,8 +1263,11 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
 		this = rb_entry(*new, struct page_stat, node);
 		parent = *new;
 
-		/* TODO: support more sort key */
-		cmp = data->alloc_bytes - this->alloc_bytes;
+		list_for_each_entry(sort, sort_list, list) {
+			cmp = sort->cmp(data, this);
+			if (cmp)
+				break;
+		}
 
 		if (cmp > 0)
 			new = &parent->rb_left;
@@ -776,7 +1279,8 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
 	rb_insert_color(&data->node, root);
 }
 
-static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted)
+static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted,
+			       struct list_head *sort_list)
 {
 	struct rb_node *node;
 	struct page_stat *data;
@@ -788,7 +1292,7 @@ static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted
 
 		rb_erase(node, root);
 		data = rb_entry(node, struct page_stat, node);
-		sort_page_insert(root_sorted, data);
+		sort_page_insert(root_sorted, data, sort_list);
 	}
 }
 
@@ -796,12 +1300,20 @@ static void sort_result(void)
 {
 	if (kmem_slab) {
 		__sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
-				   &alloc_sort);
+				   &slab_alloc_sort);
 		__sort_slab_result(&root_caller_stat, &root_caller_sorted,
-				   &caller_sort);
+				   &slab_caller_sort);
 	}
 	if (kmem_page) {
-		__sort_page_result(&page_alloc_tree, &page_alloc_sorted);
+		if (live_page)
+			__sort_page_result(&page_live_tree, &page_alloc_sorted,
+					   &page_alloc_sort);
+		else
+			__sort_page_result(&page_alloc_tree, &page_alloc_sorted,
+					   &page_alloc_sort);
+
+		__sort_page_result(&page_caller_tree, &page_caller_sorted,
+				   &page_caller_sort);
 	}
 }
 
@@ -850,8 +1362,12 @@ out:
 	return err;
 }
 
-static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+/* slab sort keys */
+static int ptr_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->ptr < r->ptr)
 		return -1;
 	else if (l->ptr > r->ptr)
@@ -864,8 +1380,11 @@ static struct sort_dimension ptr_sort_dimension = {
 	.cmp	= ptr_cmp,
 };
 
-static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int slab_callsite_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->call_site < r->call_site)
 		return -1;
 	else if (l->call_site > r->call_site)
@@ -875,11 +1394,14 @@ static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
 
 static struct sort_dimension callsite_sort_dimension = {
 	.name	= "callsite",
-	.cmp	= callsite_cmp,
+	.cmp	= slab_callsite_cmp,
 };
 
-static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int hit_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->hit < r->hit)
 		return -1;
 	else if (l->hit > r->hit)
@@ -892,8 +1414,11 @@ static struct sort_dimension hit_sort_dimension = {
 	.cmp	= hit_cmp,
 };
 
-static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int bytes_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->bytes_alloc < r->bytes_alloc)
 		return -1;
 	else if (l->bytes_alloc > r->bytes_alloc)
@@ -906,9 +1431,11 @@ static struct sort_dimension bytes_sort_dimension = {
 	.cmp	= bytes_cmp,
 };
 
-static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int frag_cmp(void *a, void *b)
 {
 	double x, y;
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
 
 	x = fragmentation(l->bytes_req, l->bytes_alloc);
 	y = fragmentation(r->bytes_req, r->bytes_alloc);
@@ -925,8 +1452,11 @@ static struct sort_dimension frag_sort_dimension = {
 	.cmp	= frag_cmp,
 };
 
-static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int pingpong_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->pingpong < r->pingpong)
 		return -1;
 	else if (l->pingpong > r->pingpong)
@@ -939,7 +1469,135 @@ static struct sort_dimension pingpong_sort_dimension = {
 	.cmp	= pingpong_cmp,
 };
 
-static struct sort_dimension *avail_sorts[] = {
+/* page sort keys */
+/*
+ * Order two struct page_stat entries by their ->page value.
+ * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
+ */
+static int page_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	return (pa->page > pb->page) - (pa->page < pb->page);
+}
+
+static struct sort_dimension page_sort_dimension = {
+	.name	= "page",
+	.cmp	= page_cmp,
+};
+
+/*
+ * Order two struct page_stat entries by their ->callsite value.
+ * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
+ */
+static int page_callsite_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	return (pa->callsite > pb->callsite) - (pa->callsite < pb->callsite);
+}
+
+static struct sort_dimension page_callsite_sort_dimension = {
+	.name	= "callsite",
+	.cmp	= page_callsite_cmp,
+};
+
+/*
+ * Order two struct page_stat entries by their ->nr_alloc count.
+ * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
+ */
+static int page_hit_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	return (pa->nr_alloc > pb->nr_alloc) - (pa->nr_alloc < pb->nr_alloc);
+}
+
+static struct sort_dimension page_hit_sort_dimension = {
+	.name	= "hit",
+	.cmp	= page_hit_cmp,
+};
+
+/*
+ * Order two struct page_stat entries by their ->alloc_bytes value.
+ * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
+ */
+static int page_bytes_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a, *pb = b;
+
+	return (pa->alloc_bytes > pb->alloc_bytes) -
+	       (pa->alloc_bytes < pb->alloc_bytes);
+}
+
+static struct sort_dimension page_bytes_sort_dimension = {
+	.name	= "bytes",
+	.cmp	= page_bytes_cmp,
+};
+
+/*
+ * Order two struct page_stat entries by their ->order value.
+ * Returns -1, 0 or 1 when @a sorts before, equal to, or after @b.
+ */
+static int page_order_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	return (pa->order > pb->order) - (pa->order < pb->order);
+}
+
+static struct sort_dimension page_order_sort_dimension = {
+	.name	= "order",
+	.cmp	= page_order_cmp,
+};
+
+/*
+ * Three-way compare on ->migrate_type.  A left-hand -1U is a wildcard
+ * (for internal use to find free'd page) and compares equal to anything.
+ */
+static int migrate_type_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	if (pa->migrate_type == -1U)
+		return 0;
+
+	return (pa->migrate_type > pb->migrate_type) -
+	       (pa->migrate_type < pb->migrate_type);
+}
+
+static struct sort_dimension migrate_type_sort_dimension = {
+	.name	= "migtype",
+	.cmp	= migrate_type_cmp,
+};
+
+/*
+ * Three-way compare on ->gfp_flags.  A left-hand -1U is a wildcard
+ * (for internal use to find free'd page) and compares equal to anything.
+ */
+static int gfp_flags_cmp(void *a, void *b)
+{
+	struct page_stat *pa = a;
+	struct page_stat *pb = b;
+
+	if (pa->gfp_flags == -1U)
+		return 0;
+
+	return (pa->gfp_flags > pb->gfp_flags) -
+	       (pa->gfp_flags < pb->gfp_flags);
+}
+
+static struct sort_dimension gfp_flags_sort_dimension = {
+	.name	= "gfp",
+	.cmp	= gfp_flags_cmp,
+};
+
+static struct sort_dimension *slab_sorts[] = {
 	&ptr_sort_dimension,
 	&callsite_sort_dimension,
 	&hit_sort_dimension,
@@ -948,16 +1606,24 @@ static struct sort_dimension *avail_sorts[] = {
 	&pingpong_sort_dimension,
 };
 
-#define NUM_AVAIL_SORTS	((int)ARRAY_SIZE(avail_sorts))
+static struct sort_dimension *page_sorts[] = {	/* page --sort keys */
+	&page_sort_dimension,
+	&page_callsite_sort_dimension,
+	&page_hit_sort_dimension,
+	&page_bytes_sort_dimension,
+	&page_order_sort_dimension,
+	&migrate_type_sort_dimension,
+	&gfp_flags_sort_dimension,
+};
 
-static int sort_dimension__add(const char *tok, struct list_head *list)
+static int slab_sort_dimension__add(const char *tok, struct list_head *list)
 {
 	struct sort_dimension *sort;
 	int i;
 
-	for (i = 0; i < NUM_AVAIL_SORTS; i++) {
-		if (!strcmp(avail_sorts[i]->name, tok)) {
-			sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
+	for (i = 0; i < (int)ARRAY_SIZE(slab_sorts); i++) {
+		if (!strcmp(slab_sorts[i]->name, tok)) {
+			sort = memdup(slab_sorts[i], sizeof(*slab_sorts[i]));
 			if (!sort) {
 				pr_err("%s: memdup failed\n", __func__);
 				return -1;
@@ -970,7 +1636,53 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
 	return -1;
 }
 
-static int setup_sorting(struct list_head *sort_list, const char *arg)
+/* Append a copy of the page sort key named @tok to @list; -1 on failure. */
+static int page_sort_dimension__add(const char *tok, struct list_head *list)
+{
+	struct sort_dimension *dim;
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(page_sorts); i++) {
+		if (strcmp(page_sorts[i]->name, tok))
+			continue;
+		dim = memdup(page_sorts[i], sizeof(*page_sorts[i]));
+		if (!dim) {
+			pr_err("%s: memdup failed\n", __func__);
+			return -1;
+		}
+		list_add_tail(&dim->list, list);
+		return 0;
+	}
+	return -1;
+}
+
+/* Split @arg on ',' and append each named slab sort key to @sort_list. */
+static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
+{
+	char *copy = strdup(arg);	/* strsep() modifies its input */
+	char *cursor = copy;
+	char *key;
+	int ret = 0;
+
+	if (!copy) {
+		pr_err("%s: strdup failed\n", __func__);
+		return -1;
+	}
+
+	/* stop at the first key the slab table does not know */
+	while ((key = strsep(&cursor, ",")) != NULL) {
+		if (slab_sort_dimension__add(key, sort_list) < 0) {
+			error("Unknown slab --sort key: '%s'", key);
+			ret = -1;
+			break;
+		}
+	}
+
+	free(copy);
+	return ret;
+}
+
+static int setup_page_sorting(struct list_head *sort_list, const char *arg)
 {
 	char *tok;
 	char *str = strdup(arg);
@@ -985,8 +1697,8 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
 		tok = strsep(&pos, ",");
 		if (!tok)
 			break;
-		if (sort_dimension__add(tok, sort_list) < 0) {
-			error("Unknown --sort key: '%s'", tok);
+		if (page_sort_dimension__add(tok, sort_list) < 0) {
+			error("Unknown page --sort key: '%s'", tok);
 			free(str);
 			return -1;
 		}
@@ -1002,10 +1714,18 @@ static int parse_sort_opt(const struct option *opt __maybe_unused,
 	if (!arg)
 		return -1;
 
-	if (caller_flag > alloc_flag)
-		return setup_sorting(&caller_sort, arg);
-	else
-		return setup_sorting(&alloc_sort, arg);
+	if (kmem_page > kmem_slab ||
+	    (kmem_page == 0 && kmem_slab == 0 && kmem_default == KMEM_PAGE)) {
+		if (caller_flag > alloc_flag)
+			return setup_page_sorting(&page_caller_sort, arg);
+		else
+			return setup_page_sorting(&page_alloc_sort, arg);
+	} else {
+		if (caller_flag > alloc_flag)
+			return setup_slab_sorting(&slab_caller_sort, arg);
+		else
+			return setup_slab_sorting(&slab_alloc_sort, arg);
+	}
 
 	return 0;
 }
@@ -1084,7 +1804,7 @@ static int __cmd_record(int argc, const char **argv)
 	if (kmem_slab)
 		rec_argc += ARRAY_SIZE(slab_events);
 	if (kmem_page)
-		rec_argc += ARRAY_SIZE(page_events);
+		rec_argc += ARRAY_SIZE(page_events) + 1; /* for -g */
 
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 
@@ -1099,6 +1819,8 @@ static int __cmd_record(int argc, const char **argv)
 			rec_argv[i] = strdup(slab_events[j]);
 	}
 	if (kmem_page) {
+		rec_argv[i++] = strdup("-g");
+
 		for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
 			rec_argv[i] = strdup(page_events[j]);
 	}
@@ -1109,9 +1831,26 @@ static int __cmd_record(int argc, const char **argv)
 	return cmd_record(i, rec_argv, NULL);
 }
 
+/* perf_config() callback: handle "kmem.default", defer everything else. */
+static int kmem_config(const char *var, const char *value, void *cb)
+{
+	if (strcmp(var, "kmem.default"))
+		return perf_default_config(var, value, cb);
+	/* NOTE(review): value is presumably NULL for a valueless key */
+	if (value && !strcmp(value, "slab"))
+		kmem_default = KMEM_SLAB;
+	else if (value && !strcmp(value, "page"))
+		kmem_default = KMEM_PAGE;
+	else
+		pr_err("invalid default value ('slab' or 'page' required): %s\n",
+		       value ? value : "(null)");
+	return 0;
+}
+
 int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	const char * const default_sort_order = "frag,hit,bytes";
+	const char * const default_slab_sort = "frag,hit,bytes";
+	const char * const default_page_sort = "bytes,hit";
 	struct perf_data_file file = {
 		.mode = PERF_DATA_MODE_READ,
 	};
@@ -1124,8 +1863,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
 			   "show per-allocation statistics", parse_alloc_opt),
 	OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
-		     "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
-		     parse_sort_opt),
+		     "sort by keys: ptr, callsite, bytes, hit, pingpong, frag, "
+		     "page, order, migtype, gfp", parse_sort_opt),
 	OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
 	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
@@ -1133,6 +1872,7 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 			   parse_slab_opt),
 	OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
 			   parse_page_opt),
+	OPT_BOOLEAN(0, "live", &live_page, "Show live page stat"),
 	OPT_END()
 	};
 	const char *const kmem_subcommands[] = { "record", "stat", NULL };
@@ -1142,15 +1882,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	};
 	struct perf_session *session;
 	int ret = -1;
+	const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
 
+	perf_config(kmem_config, NULL);
 	argc = parse_options_subcommand(argc, argv, kmem_options,
 					kmem_subcommands, kmem_usage, 0);
 
 	if (!argc)
 		usage_with_options(kmem_usage, kmem_options);
 
-	if (kmem_slab == 0 && kmem_page == 0)
-		kmem_slab = 1;  /* for backward compatibility */
+	if (kmem_slab == 0 && kmem_page == 0) {
+		if (kmem_default == KMEM_SLAB)
+			kmem_slab = 1;
+		else
+			kmem_page = 1;
+	}
 
 	if (!strncmp(argv[0], "rec", 3)) {
 		symbol__init(NULL);
@@ -1159,19 +1905,30 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	file.path = input_name;
 
-	session = perf_session__new(&file, false, &perf_kmem);
+	kmem_session = session = perf_session__new(&file, false, &perf_kmem);
 	if (session == NULL)
 		return -1;
 
+	if (kmem_slab) {
+		if (!perf_evlist__find_tracepoint_by_name(session->evlist,
+							  "kmem:kmalloc")) {
+			pr_err(errmsg, "slab", "slab");
+			return -1;
+		}
+	}
+
 	if (kmem_page) {
-		struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+		struct perf_evsel *evsel;
 
-		if (evsel == NULL || evsel->tp_format == NULL) {
-			pr_err("invalid event found.. aborting\n");
+		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+							     "kmem:mm_page_alloc");
+		if (evsel == NULL) {
+			pr_err(errmsg, "page", "page");
 			return -1;
 		}
 
 		kmem_page_size = pevent_get_page_size(evsel->tp_format->pevent);
+		symbol_conf.use_callchain = true;
 	}
 
 	symbol__init(&session->header.env);
@@ -1182,11 +1939,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		if (cpu__setup_cpunode_map())
 			goto out_delete;
 
-		if (list_empty(&caller_sort))
-			setup_sorting(&caller_sort, default_sort_order);
-		if (list_empty(&alloc_sort))
-			setup_sorting(&alloc_sort, default_sort_order);
-
+		if (list_empty(&slab_caller_sort))
+			setup_slab_sorting(&slab_caller_sort, default_slab_sort);
+		if (list_empty(&slab_alloc_sort))
+			setup_slab_sorting(&slab_alloc_sort, default_slab_sort);
+		if (list_empty(&page_caller_sort))
+			setup_page_sorting(&page_caller_sort, default_page_sort);
+		if (list_empty(&page_alloc_sort))
+			setup_page_sorting(&page_alloc_sort, default_page_sort);
+
+		if (kmem_page) {
+			setup_page_sorting(&page_alloc_sort_input,
+					   "page,order,migtype,gfp");
+			setup_page_sorting(&page_caller_sort_input,
+					   "callsite,order,migtype,gfp");
+		}
 		ret = __cmd_kmem(session);
 	} else
 		usage_with_options(kmem_usage, kmem_options);
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index f7b1af67e9f6..53d475b1422e 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -44,22 +44,19 @@
 
 #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
 #define DEFAULT_FUNC_FILTER "!_*"
+#define DEFAULT_LIST_FILTER "*:*"
 
 /* Session management structure */
 static struct {
+	int command;	/* Command short_name */
 	bool list_events;
 	bool force_add;
-	bool show_lines;
-	bool show_vars;
 	bool show_ext_vars;
-	bool show_funcs;
-	bool mod_events;
 	bool uprobes;
 	bool quiet;
 	bool target_used;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
-	struct strlist *dellist;
 	struct line_range line_range;
 	char *target;
 	int max_probe_points;
@@ -93,6 +90,28 @@ static int parse_probe_event(const char *str)
 	return ret;
 }
 
+/* Install @str as params.filter, or OR it into the filter already set. */
+static int params_add_filter(const char *str)
+{
+	const char *errpos = NULL;
+	int ret = 0;
+
+	pr_debug2("Add filter: %s\n", str);
+	if (params.filter) {
+		ret = strfilter__or(params.filter, str, &errpos);
+	} else {
+		params.filter = strfilter__new(str, &errpos);
+		if (!params.filter)
+			ret = errpos ? -EINVAL : -ENOMEM;
+	}
+	if (ret != -EINVAL)
+		return ret;
+
+	pr_err("Filter parse error at %td.\n", errpos - str + 1);
+	pr_err("Source: \"%s\"\n", str);
+	pr_err("         %*c\n", (int)(errpos - str + 1), '^');
+	return -EINVAL;
+}
+
 static int set_target(const char *ptr)
 {
 	int found = 0;
@@ -152,34 +171,11 @@ static int parse_probe_event_argv(int argc, const char **argv)
 
 		len += sprintf(&buf[len], "%s ", argv[i]);
 	}
-	params.mod_events = true;
 	ret = parse_probe_event(buf);
 	free(buf);
 	return ret;
 }
 
-static int opt_add_probe_event(const struct option *opt __maybe_unused,
-			      const char *str, int unset __maybe_unused)
-{
-	if (str) {
-		params.mod_events = true;
-		return parse_probe_event(str);
-	} else
-		return 0;
-}
-
-static int opt_del_probe_event(const struct option *opt __maybe_unused,
-			       const char *str, int unset __maybe_unused)
-{
-	if (str) {
-		params.mod_events = true;
-		if (!params.dellist)
-			params.dellist = strlist__new(true, NULL);
-		strlist__add(params.dellist, str);
-	}
-	return 0;
-}
-
 static int opt_set_target(const struct option *opt, const char *str,
 			int unset __maybe_unused)
 {
@@ -217,8 +213,10 @@ static int opt_set_target(const struct option *opt, const char *str,
 	return ret;
 }
 
+/* Command option callbacks */
+
 #ifdef HAVE_DWARF_SUPPORT
-static int opt_show_lines(const struct option *opt __maybe_unused,
+static int opt_show_lines(const struct option *opt,
 			  const char *str, int unset __maybe_unused)
 {
 	int ret = 0;
@@ -226,19 +224,19 @@ static int opt_show_lines(const struct option *opt __maybe_unused,
 	if (!str)
 		return 0;
 
-	if (params.show_lines) {
+	if (params.command == 'L') {
 		pr_warning("Warning: more than one --line options are"
 			   " detected. Only the first one is valid.\n");
 		return 0;
 	}
 
-	params.show_lines = true;
+	params.command = opt->short_name;
 	ret = parse_line_range_desc(str, &params.line_range);
 
 	return ret;
 }
 
-static int opt_show_vars(const struct option *opt __maybe_unused,
+static int opt_show_vars(const struct option *opt,
 			 const char *str, int unset __maybe_unused)
 {
 	struct perf_probe_event *pev = &params.events[params.nevents];
@@ -252,29 +250,39 @@ static int opt_show_vars(const struct option *opt __maybe_unused,
 		pr_err("  Error: '--vars' doesn't accept arguments.\n");
 		return -EINVAL;
 	}
-	params.show_vars = true;
+	params.command = opt->short_name;
 
 	return ret;
 }
 #endif
+/* Callback: store the option letter as the command, then parse @str. */
+static int opt_add_probe_event(const struct option *opt,
+			      const char *str, int unset __maybe_unused)
+{
+	if (!str)
+		return 0;
+
+	params.command = opt->short_name;
+	return parse_probe_event(str);
+}
+
+/* Callback: unless negated, select the command; add @str as a filter. */
+static int opt_set_filter_with_command(const struct option *opt,
+				       const char *str, int unset)
+{
+	if (unset == 0)
+		params.command = opt->short_name;
+
+	if (!str)
+		return 0;
+	return params_add_filter(str);
+}
 
 static int opt_set_filter(const struct option *opt __maybe_unused,
 			  const char *str, int unset __maybe_unused)
 {
-	const char *err;
-
-	if (str) {
-		pr_debug2("Set filter: %s\n", str);
-		if (params.filter)
-			strfilter__delete(params.filter);
-		params.filter = strfilter__new(str, &err);
-		if (!params.filter) {
-			pr_err("Filter parse error at %td.\n", err - str + 1);
-			pr_err("Source: \"%s\"\n", str);
-			pr_err("         %*c\n", (int)(err - str + 1), '^');
-			return -EINVAL;
-		}
-	}
+	if (str)
+		return params_add_filter(str);	/* adds to any filter already set */
 
 	return 0;
 }
@@ -290,8 +298,6 @@ static void cleanup_params(void)
 
 	for (i = 0; i < params.nevents; i++)
 		clear_perf_probe_event(params.events + i);
-	if (params.dellist)
-		strlist__delete(params.dellist);
 	line_range__clear(&params.line_range);
 	free(params.target);
 	if (params.filter)
@@ -316,22 +322,24 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 		"perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
 		"perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
 		"perf probe [<options>] --del '[GROUP:]EVENT' ...",
-		"perf probe --list",
+		"perf probe --list [GROUP:]EVENT ...",
 #ifdef HAVE_DWARF_SUPPORT
 		"perf probe [<options>] --line 'LINEDESC'",
 		"perf probe [<options>] --vars 'PROBEPOINT'",
 #endif
+		"perf probe [<options>] --funcs",
 		NULL
-};
+	};
 	struct option options[] = {
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show parsed arguments, etc)"),
 	OPT_BOOLEAN('q', "quiet", &params.quiet,
 		    "be quiet (do not show any mesages)"),
-	OPT_BOOLEAN('l', "list", &params.list_events,
-		    "list up current probe events"),
+	OPT_CALLBACK_DEFAULT('l', "list", NULL, "[GROUP:]EVENT",
+			     "list up probe events",
+			     opt_set_filter_with_command, DEFAULT_LIST_FILTER),
 	OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
-		opt_del_probe_event),
+		     opt_set_filter_with_command),
 	OPT_CALLBACK('a', "add", NULL,
 #ifdef HAVE_DWARF_SUPPORT
 		"[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
@@ -378,8 +386,9 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
 		 "Set how many probe points can be found for a probe."),
-	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
-		    "Show potential probe-able functions."),
+	OPT_CALLBACK_DEFAULT('F', "funcs", NULL, "[FILTER]",
+			     "Show potential probe-able functions.",
+			     opt_set_filter_with_command, DEFAULT_FUNC_FILTER),
 	OPT_CALLBACK('\0', "filter", NULL,
 		     "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
 		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
@@ -402,6 +411,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	set_option_flag(options, 'L', "line", PARSE_OPT_EXCLUSIVE);
 	set_option_flag(options, 'V', "vars", PARSE_OPT_EXCLUSIVE);
 #endif
+	set_option_flag(options, 'F', "funcs", PARSE_OPT_EXCLUSIVE);
 
 	argc = parse_options(argc, argv, options, probe_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
@@ -410,11 +420,16 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 			pr_warning("  Error: '-' is not supported.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (params.command && params.command != 'a') {
+			pr_warning("  Error: another command except --add is set.\n");
+			usage_with_options(probe_usage, options);
+		}
 		ret = parse_probe_event_argv(argc, argv);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Command Parse Error.", ret);
 			return ret;
 		}
+		params.command = 'a';
 	}
 
 	if (params.quiet) {
@@ -428,47 +443,35 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (params.max_probe_points == 0)
 		params.max_probe_points = MAX_PROBES;
 
-	if ((!params.nevents && !params.dellist && !params.list_events &&
-	     !params.show_lines && !params.show_funcs))
-		usage_with_options(probe_usage, options);
-
 	/*
 	 * Only consider the user's kernel image path if given.
 	 */
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 
-	if (params.list_events) {
+	switch (params.command) {
+	case 'l':
 		if (params.uprobes) {
 			pr_warning("  Error: Don't use --list with --exec.\n");
 			usage_with_options(probe_usage, options);
 		}
-		ret = show_perf_probe_events();
+		ret = show_perf_probe_events(params.filter);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show event list.", ret);
 		return ret;
-	}
-	if (params.show_funcs) {
-		if (!params.filter)
-			params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
-						       NULL);
+	case 'F':
 		ret = show_available_funcs(params.target, params.filter,
 					params.uprobes);
-		strfilter__delete(params.filter);
-		params.filter = NULL;
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show functions.", ret);
 		return ret;
-	}
-
 #ifdef HAVE_DWARF_SUPPORT
-	if (params.show_lines) {
+	case 'L':
 		ret = show_line_range(&params.line_range, params.target,
 				      params.uprobes);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show lines.", ret);
 		return ret;
-	}
-	if (params.show_vars) {
+	case 'V':
 		if (!params.filter)
 			params.filter = strfilter__new(DEFAULT_VAR_FILTER,
 						       NULL);
@@ -478,23 +481,18 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 					  params.target,
 					  params.filter,
 					  params.show_ext_vars);
-		strfilter__delete(params.filter);
-		params.filter = NULL;
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show vars.", ret);
 		return ret;
-	}
 #endif
-
-	if (params.dellist) {
-		ret = del_perf_probe_events(params.dellist);
+	case 'd':
+		ret = del_perf_probe_events(params.filter);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Failed to delete events.", ret);
 			return ret;
 		}
-	}
-
-	if (params.nevents) {
+		break;
+	case 'a':
 		/* Ensure the last given target is used */
 		if (params.target && !params.target_used) {
 			pr_warning("  Error: -x/-m must follow the probe definitions.\n");
@@ -508,6 +506,9 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 			pr_err_with_code("  Error: Failed to add events.", ret);
 			return ret;
 		}
+		break;
+	default:
+		usage_with_options(probe_usage, options);
 	}
 	return 0;
 }
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index c3efdfb630b5..5dfe91395617 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -27,6 +27,7 @@
 #include "util/cpumap.h"
 #include "util/thread_map.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
 
 #include <unistd.h>
 #include <sched.h>
@@ -38,6 +39,7 @@ struct record {
 	struct record_opts	opts;
 	u64			bytes_written;
 	struct perf_data_file	file;
+	struct auxtrace_record	*itr;
 	struct perf_evlist	*evlist;
 	struct perf_session	*session;
 	const char		*progname;
@@ -110,9 +112,12 @@ out:
 	return rc;
 }
 
-static volatile int done = 0;
+static volatile int done;
 static volatile int signr = -1;
-static volatile int child_finished = 0;
+static volatile int child_finished;
+static volatile int auxtrace_snapshot_enabled;
+static volatile int auxtrace_snapshot_err;
+static volatile int auxtrace_record__snapshot_started;
 
 static void sig_handler(int sig)
 {
@@ -133,6 +138,133 @@ static void record__sig_exit(void)
 	raise(signr);
 }
 
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+/*
+ * Write one AUX area event: the header @event, then its payload in up to
+ * two pieces (@data1, @data2), zero-padded to a multiple of 8 bytes.
+ */
+static int record__process_auxtrace(struct perf_tool *tool,
+				    union perf_event *event, void *data1,
+				    size_t len1, void *data2, size_t len2)
+{
+	struct record *rec = container_of(tool, struct record, tool);
+	u8 zeroes[8] = {0};
+	size_t padding;
+
+	if (!perf_data_file__is_pipe(&rec->file)) {
+		/* index the event by its current offset in the output file */
+		off_t pos = lseek(perf_data_file__fd(&rec->file), 0, SEEK_CUR);
+		int err;
+
+		if (pos == -1)
+			return -1;
+		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
+						     event, pos);
+		if (err)
+			return err;
+	}
+
+	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
+	padding = (8 - ((len1 + len2) & 7)) & 7;
+
+	record__write(rec, event, event->header.size);
+	record__write(rec, data1, len1);
+	if (len2)
+		record__write(rec, data2, len2);
+	record__write(rec, &zeroes, padding);
+
+	return 0;
+}
+
+/* Read @mm via auxtrace_mmap__read(); a non-zero result bumps ->samples. */
+static int record__auxtrace_mmap_read(struct record *rec,
+				      struct auxtrace_mmap *mm)
+{
+	int nr = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
+				     record__process_auxtrace);
+
+	if (nr < 0)
+		return nr;
+
+	if (nr > 0)
+		rec->samples++;
+
+	return 0;
+}
+
+/* Snapshot read of @mm; a non-zero result bumps rec->samples. */
+static int record__auxtrace_mmap_read_snapshot(struct record *rec,
+					       struct auxtrace_mmap *mm)
+{
+	int nr = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
+					      record__process_auxtrace,
+					      rec->opts.auxtrace_snapshot_size);
+
+	if (nr < 0)
+		return nr;
+
+	if (nr > 0)
+		rec->samples++;
+
+	return 0;
+}
+
+/*
+ * Read a snapshot from every mmap whose auxtrace_mmap has ->base set.
+ * Stops at the first failure and returns -1; returns 0 otherwise.
+ */
+static int record__auxtrace_read_snapshot_all(struct record *rec)
+{
+	struct perf_evlist *evlist = rec->evlist;
+	int idx;
+
+	for (idx = 0; idx < evlist->nr_mmaps; idx++) {
+		struct auxtrace_mmap *mm = &evlist->mmap[idx].auxtrace_mmap;
+
+		if (!mm->base)
+			continue;
+		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+static void record__read_auxtrace_snapshot(struct record *rec)
+{
+	pr_debug("Recording AUX area tracing snapshot\n");
+	if (record__auxtrace_read_snapshot_all(rec) < 0) {
+		auxtrace_snapshot_err = -1;	/* reading failed */
+		return;
+	}
+	auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
+	if (!auxtrace_snapshot_err)
+		auxtrace_snapshot_enabled = 1;	/* allow the next snapshot */
+}
+
+#else
+
+static inline
+int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
+			       struct auxtrace_mmap *mm __maybe_unused)
+{
+	return 0;	/* stub used when HAVE_AUXTRACE_SUPPORT is not defined */
+}
+
+static inline
+void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
+{	/* no-op stub used when HAVE_AUXTRACE_SUPPORT is not defined */
+}
+
+static inline
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
+{
+	return 0;	/* stub used when HAVE_AUXTRACE_SUPPORT is not defined */
+}
+
+#endif
+
 static int record__open(struct record *rec)
 {
 	char msg[512];
@@ -169,13 +301,16 @@ try_again:
 		goto out;
 	}
 
-	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
+				 opts->auxtrace_mmap_pages,
+				 opts->auxtrace_snapshot_mode) < 0) {
 		if (errno == EPERM) {
 			pr_err("Permission error mapping pages.\n"
 			       "Consider increasing "
 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
 			       "or try again with a smaller value of -m/--mmap_pages.\n"
-			       "(current value: %u)\n", opts->mmap_pages);
+			       "(current value: %u,%u)\n",
+			       opts->mmap_pages, opts->auxtrace_mmap_pages);
 			rc = -errno;
 		} else {
 			pr_err("failed to mmap with %d (%s)\n", errno,
@@ -270,12 +405,20 @@ static int record__mmap_read_all(struct record *rec)
 	int rc = 0;
 
 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
+
 		if (rec->evlist->mmap[i].base) {
 			if (record__mmap_read(rec, i) != 0) {
 				rc = -1;
 				goto out;
 			}
 		}
+
+		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
+		    record__auxtrace_mmap_read(rec, mm) != 0) {
+			rc = -1;
+			goto out;
+		}
 	}
 
 	/*
@@ -305,6 +448,9 @@ static void record__init_features(struct record *rec)
 
 	if (!rec->opts.branch_stack)
 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
+
+	if (!rec->opts.full_auxtrace)
+		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
 }
 
 static volatile int workload_exec_errno;
@@ -323,6 +469,8 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
 	child_finished = 1;
 }
 
+static void snapshot_sig_handler(int sig);
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
 	int err;
@@ -343,6 +491,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 	signal(SIGTERM, sig_handler);
+	if (rec->opts.auxtrace_snapshot_mode)
+		signal(SIGUSR2, snapshot_sig_handler);
+	else
+		signal(SIGUSR2, SIG_IGN);
 
 	session = perf_session__new(file, false, tool);
 	if (session == NULL) {
@@ -421,6 +573,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->opts.full_auxtrace) {
+		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+					session, process_synthesized_event);
+		if (err)
+			goto out_delete_session;
+	}
+
 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
 						 machine);
 	if (err < 0)
@@ -475,14 +634,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		perf_evlist__enable(rec->evlist);
 	}
 
+	auxtrace_snapshot_enabled = 1;
 	for (;;) {
 		int hits = rec->samples;
 
 		if (record__mmap_read_all(rec) < 0) {
+			auxtrace_snapshot_enabled = 0;
 			err = -1;
 			goto out_child;
 		}
 
+		if (auxtrace_record__snapshot_started) {
+			auxtrace_record__snapshot_started = 0;
+			if (!auxtrace_snapshot_err)
+				record__read_auxtrace_snapshot(rec);
+			if (auxtrace_snapshot_err) {
+				pr_err("AUX area tracing snapshot failed\n");
+				err = -1;
+				goto out_child;
+			}
+		}
+
 		if (hits == rec->samples) {
 			if (done || draining)
 				break;
@@ -505,10 +677,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		 * disable events in this case.
 		 */
 		if (done && !disabled && !target__none(&opts->target)) {
+			auxtrace_snapshot_enabled = 0;
 			perf_evlist__disable(rec->evlist);
 			disabled = true;
 		}
 	}
+	auxtrace_snapshot_enabled = 0;
 
 	if (forks && workload_exec_errno) {
 		char msg[STRERR_BUFSIZE];
@@ -545,15 +719,23 @@ out_child:
 	if (!err && !file->is_pipe) {
 		rec->session->header.data_size += rec->bytes_written;
 
-		if (!rec->no_buildid)
+		if (!rec->no_buildid) {
 			process_buildids(rec);
+			/*
+			 * We take all buildids when the file contains
+			 * AUX area tracing data because we do not decode the
+			 * trace because it would take too long.
+			 */
+			if (rec->opts.full_auxtrace)
+				dsos__hit_all(rec->session);
+		}
 		perf_session__write_header(rec->session, rec->evlist, fd, true);
 	}
 
 	if (!err && !quiet) {
 		char samples[128];
 
-		if (rec->samples)
+		if (rec->samples && !rec->opts.full_auxtrace)
 			scnprintf(samples, sizeof(samples),
 				  " (%" PRIu64 " samples)", rec->samples);
 		else
@@ -795,6 +977,49 @@ static int parse_clockid(const struct option *opt, const char *str, int unset)
 	return -1;
 }
 
+/*
+ * Parse "pages[,pages]" for -m/--mmap-pages: the part before the comma
+ * (if not empty) sets opts->mmap_pages, the part after it sets
+ * opts->auxtrace_mmap_pages.  Returns 0 on success, non-zero on error.
+ */
+static int record__parse_mmap_pages(const struct option *opt,
+				    const char *str,
+				    int unset __maybe_unused)
+{
+	struct record_opts *opts = opt->value;
+	unsigned int nr_pages;
+	char *first, *second;
+	int ret = 0;
+
+	if (!str)
+		return -EINVAL;
+
+	first = strdup(str);
+	if (!first)
+		return -ENOMEM;
+
+	second = strchr(first, ',');
+	if (second)
+		*second++ = '\0';
+
+	if (*first) {
+		ret = __perf_evlist__parse_mmap_pages(&nr_pages, first);
+		if (ret)
+			goto out_free;
+		opts->mmap_pages = nr_pages;
+	}
+
+	if (second) {
+		ret = __perf_evlist__parse_mmap_pages(&nr_pages, second);
+		if (!ret)
+			opts->auxtrace_mmap_pages = nr_pages;
+	}
+
+out_free:
+	free(first);
+	return ret;
+}
+
 static const char * const __record_usage[] = {
 	"perf record [<options>] [<command>]",
 	"perf record [<options>] -- <command> [<options>]",
@@ -875,9 +1100,9 @@ struct option __record_options[] = {
 			&record.opts.no_inherit_set,
 			"child tasks do not inherit counters"),
 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
-	OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
-		     "number of mmap data pages",
-		     perf_evlist__parse_mmap_pages),
+	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
+		     "number of mmap data pages and AUX area tracing mmap pages",
+		     record__parse_mmap_pages),
 	OPT_BOOLEAN(0, "group", &record.opts.group,
 		    "put the counters into a counter group"),
 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
@@ -929,6 +1154,8 @@ struct option __record_options[] = {
 	OPT_CALLBACK('k', "clockid", &record.opts,
 	"clockid", "clockid to use for events, see clock_gettime()",
 	parse_clockid),
+	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
+			  "opts", "AUX area tracing Snapshot Mode", ""),
 	OPT_END()
 };
 
@@ -936,7 +1163,7 @@ struct option *record_options = __record_options;
 
 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	int err = -ENOMEM;
+	int err;
 	struct record *rec = &record;
 	char errbuf[BUFSIZ];
 
@@ -957,6 +1184,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 		usage_with_options(record_usage, record_options);
 	}
 
+	if (!rec->itr) {
+		rec->itr = auxtrace_record__init(rec->evlist, &err);
+		if (err)
+			return err;
+	}
+
+	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
+					      rec->opts.auxtrace_snapshot_opts);
+	if (err)
+		return err;
+
+	err = -ENOMEM;
+
 	symbol__init(NULL);
 
 	if (symbol_conf.kptr_restrict)
@@ -1002,6 +1242,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
 		usage_with_options(record_usage, record_options);
 
+	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
+	if (err)
+		goto out_symbol_exit;
+
 	if (record_opts__config(&rec->opts)) {
 		err = -EINVAL;
 		goto out_symbol_exit;
@@ -1011,5 +1255,15 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 out_symbol_exit:
 	perf_evlist__delete(rec->evlist);
 	symbol__exit();
+	auxtrace_record__free(rec->itr);
 	return err;
 }
+
+static void snapshot_sig_handler(int sig __maybe_unused)
+{
+	if (!auxtrace_snapshot_enabled)
+		return;
+	auxtrace_snapshot_enabled = 0;
+	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+	auxtrace_record__snapshot_started = 1;
+}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 476cdf7afcca..18cb0ff39b4e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -36,6 +36,8 @@
 #include "util/data.h"
 #include "arch/common.h"
 
+#include "util/auxtrace.h"
+
 #include <dlfcn.h>
 #include <linux/bitmap.h>
 
@@ -585,6 +587,7 @@ parse_percent_limit(const struct option *opt, const char *str,
 int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 {
 	struct perf_session *session;
+	struct itrace_synth_opts itrace_synth_opts = { .set = 0, };
 	struct stat st;
 	bool has_br_stack = false;
 	int branch_mode = -1;
@@ -607,6 +610,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 			.attr		 = perf_event__process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
+			.id_index	 = perf_event__process_id_index,
+			.auxtrace_info	 = perf_event__process_auxtrace_info,
+			.auxtrace	 = perf_event__process_auxtrace,
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -717,6 +723,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "Don't show entries under that percent", parse_percent_limit),
 	OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
 		     "how to display percentage of filtered entries", parse_filter_percentage),
+	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+			    "Instruction Tracing options",
+			    itrace_parse_synth_opts),
 	OPT_END()
 	};
 	struct perf_data_file file = {
@@ -761,6 +770,8 @@ repeat:
 					       report.queue_size);
 	}
 
+	session->itrace_synth_opts = &itrace_synth_opts;
+
 	report.session = session;
 
 	has_br_stack = perf_header__has_feat(&session->header,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 58f10b8e6ff2..6805098e3751 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -16,6 +16,7 @@
 #include "util/evsel.h"
 #include "util/sort.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
 #include <linux/bitmap.h>
 
 static char const		*script_name;
@@ -26,6 +27,7 @@ static u64			nr_unordered;
 static bool			no_callchain;
 static bool			latency_format;
 static bool			system_wide;
+static bool			print_flags;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
@@ -146,9 +148,10 @@ static const char *output_field2str(enum perf_output_field field)
 
 #define PRINT_FIELD(x)  (output[attr->type].fields & PERF_OUTPUT_##x)
 
-static int perf_evsel__check_stype(struct perf_evsel *evsel,
-				   u64 sample_type, const char *sample_msg,
-				   enum perf_output_field field)
+static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
+				      u64 sample_type, const char *sample_msg,
+				      enum perf_output_field field,
+				      bool allow_user_set)
 {
 	struct perf_event_attr *attr = &evsel->attr;
 	int type = attr->type;
@@ -158,6 +161,8 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
 		return 0;
 
 	if (output[type].user_set) {
+		if (allow_user_set)
+			return 0;
 		evname = perf_evsel__name(evsel);
 		pr_err("Samples for '%s' event do not have %s attribute set. "
 		       "Cannot print '%s' field.\n",
@@ -175,10 +180,22 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
 	return 0;
 }
 
+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+				   u64 sample_type, const char *sample_msg,
+				   enum perf_output_field field)
+{
+	return perf_evsel__do_check_stype(evsel, sample_type, sample_msg, field,
+					  false);
+}
+
 static int perf_evsel__check_attr(struct perf_evsel *evsel,
 				  struct perf_session *session)
 {
 	struct perf_event_attr *attr = &evsel->attr;
+	bool allow_user_set;
+
+	allow_user_set = perf_header__has_feat(&session->header,
+					       HEADER_AUXTRACE);
 
 	if (PRINT_FIELD(TRACE) &&
 		!perf_session__has_traces(session, "record -R"))
@@ -191,8 +208,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 	}
 
 	if (PRINT_FIELD(ADDR) &&
-		perf_evsel__check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
-					PERF_OUTPUT_ADDR))
+		perf_evsel__do_check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
+					   PERF_OUTPUT_ADDR, allow_user_set))
 		return -EINVAL;
 
 	if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
@@ -229,8 +246,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		return -EINVAL;
 
 	if (PRINT_FIELD(CPU) &&
-		perf_evsel__check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
-					PERF_OUTPUT_CPU))
+		perf_evsel__do_check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
+					   PERF_OUTPUT_CPU, allow_user_set))
 		return -EINVAL;
 
 	if (PRINT_FIELD(PERIOD) &&
@@ -445,6 +462,25 @@ static void print_sample_bts(union perf_event *event,
 	printf("\n");
 }
 
+static void print_sample_flags(u32 flags)
+{
+	const char *chars = PERF_IP_FLAG_CHARS;
+	const int n = strlen(PERF_IP_FLAG_CHARS);
+	char str[33];
+	int i, pos = 0;
+
+	for (i = 0; i < n; i++, flags >>= 1) {
+		if (flags & 1)
+			str[pos++] = chars[i];
+	}
+	for (; i < 32; i++, flags >>= 1) {
+		if (flags & 1)
+			str[pos++] = '?';
+	}
+	str[pos] = 0;
+	printf("  %-4s ", str);
+}
+
 static void process_event(union perf_event *event, struct perf_sample *sample,
 			  struct perf_evsel *evsel, struct addr_location *al)
 {
@@ -464,6 +500,9 @@ static void process_event(union perf_event *event, struct perf_sample *sample,
 		printf("%s: ", evname ? evname : "[unknown]");
 	}
 
+	if (print_flags)
+		print_sample_flags(sample->flags);
+
 	if (is_bts_event(attr)) {
 		print_sample_bts(event, sample, evsel, thread, al);
 		return;
@@ -999,12 +1038,15 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
 		}
 	}
 
-	tok = strtok(tok, ",");
-	while (tok) {
+	for (tok = strtok(tok, ","); tok; tok = strtok(NULL, ",")) {
 		for (i = 0; i < imax; ++i) {
 			if (strcmp(tok, all_output_options[i].str) == 0)
 				break;
 		}
+		if (i == imax && strcmp(tok, "flags") == 0) {
+			print_flags = true;
+			continue;
+		}
 		if (i == imax) {
 			fprintf(stderr, "Invalid field requested.\n");
 			rc = -EINVAL;
@@ -1032,8 +1074,6 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
 			}
 			output[type].fields |= all_output_options[i].field;
 		}
-
-		tok = strtok(NULL, ",");
 	}
 
 	if (type >= 0) {
@@ -1497,6 +1537,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	char *rec_script_path = NULL;
 	char *rep_script_path = NULL;
 	struct perf_session *session;
+	struct itrace_synth_opts itrace_synth_opts = { .set = false, };
 	char *script_path = NULL;
 	const char **__argv;
 	int i, j, err = 0;
@@ -1511,6 +1552,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 			.attr		 = process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
+			.id_index	 = perf_event__process_id_index,
+			.auxtrace_info	 = perf_event__process_auxtrace_info,
+			.auxtrace	 = perf_event__process_auxtrace,
+			.auxtrace_error	 = perf_event__process_auxtrace_error,
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -1549,7 +1594,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "comma separated output fields prepend with 'type:'. "
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
-		     "addr,symoff,period", parse_output_fields),
+		     "addr,symoff,period,flags", parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -1570,6 +1615,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events,
 		    "Show the mmap events"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+			    "Instruction Tracing options",
+			    itrace_parse_synth_opts),
 	OPT_END()
 	};
 	const char * const script_subcommands[] = { "record", "report", NULL };
@@ -1765,6 +1813,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	script.session = session;
 
+	session->itrace_synth_opts = &itrace_synth_opts;
+
 	if (cpu_list) {
 		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
 		if (err < 0)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f7b8218785f6..fd577f725d23 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -247,21 +247,50 @@ out_free:
 	return -1;
 }
 
+enum {
+	CTX_BIT_USER	= 1 << 0,
+	CTX_BIT_KERNEL	= 1 << 1,
+	CTX_BIT_HV	= 1 << 2,
+	CTX_BIT_HOST	= 1 << 3,
+	CTX_BIT_IDLE	= 1 << 4,
+	CTX_BIT_MAX	= 1 << 5,
+};
+
+#define NUM_CTX CTX_BIT_MAX
+
 static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
-static struct stats runtime_branches_stats[MAX_NR_CPUS];
-static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
-static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats walltime_nsecs_stats;
-static struct stats runtime_transaction_stats[MAX_NR_CPUS];
-static struct stats runtime_elision_stats[MAX_NR_CPUS];
+static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+
+static int evsel_context(struct perf_evsel *evsel)
+{
+	int ctx = 0;
+
+	if (evsel->attr.exclude_kernel)
+		ctx |= CTX_BIT_KERNEL;
+	if (evsel->attr.exclude_user)
+		ctx |= CTX_BIT_USER;
+	if (evsel->attr.exclude_hv)
+		ctx |= CTX_BIT_HV;
+	if (evsel->attr.exclude_host)
+		ctx |= CTX_BIT_HOST;
+	if (evsel->attr.exclude_idle)
+		ctx |= CTX_BIT_IDLE;
+
+	return ctx;
+}
 
 static void perf_stat__reset_stats(struct perf_evlist *evlist)
 {
@@ -356,37 +385,39 @@ static struct perf_evsel *nth_evsel(int n)
 static void update_shadow_stats(struct perf_evsel *counter, u64 *count,
 				int cpu)
 {
+	int ctx = evsel_context(counter);
+
 	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
 		update_stats(&runtime_nsecs_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-		update_stats(&runtime_cycles_stats[cpu], count[0]);
+		update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
-		update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]);
+		update_stats(&runtime_cycles_in_tx_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
-		update_stats(&runtime_transaction_stats[cpu], count[0]);
+		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
-		update_stats(&runtime_elision_stats[cpu], count[0]);
+		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
-		update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]);
+		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-		update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]);
+		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-		update_stats(&runtime_branches_stats[cpu], count[0]);
+		update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-		update_stats(&runtime_cacherefs_stats[cpu], count[0]);
+		update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-		update_stats(&runtime_l1_dcache_stats[cpu], count[0]);
+		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-		update_stats(&runtime_l1_icache_stats[cpu], count[0]);
+		update_stats(&runtime_l1_icache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-		update_stats(&runtime_ll_cache_stats[cpu], count[0]);
+		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-		update_stats(&runtime_dtlb_cache_stats[cpu], count[0]);
+		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-		update_stats(&runtime_itlb_cache_stats[cpu], count[0]);
+		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
 }
 
 static void zero_per_pkg(struct perf_evsel *counter)
@@ -908,8 +939,9 @@ static void print_stalled_cycles_frontend(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[cpu]);
+	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -927,8 +959,9 @@ static void print_stalled_cycles_backend(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[cpu]);
+	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -946,8 +979,9 @@ static void print_branch_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_branches_stats[cpu]);
+	total = avg_stats(&runtime_branches_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -965,8 +999,9 @@ static void print_l1_dcache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_dcache_stats[cpu]);
+	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -984,8 +1019,9 @@ static void print_l1_icache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_icache_stats[cpu]);
+	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1003,8 +1039,9 @@ static void print_dtlb_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
+	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1022,8 +1059,9 @@ static void print_itlb_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_itlb_cache_stats[cpu]);
+	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1041,8 +1079,9 @@ static void print_ll_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_ll_cache_stats[cpu]);
+	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1060,6 +1099,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	double sc =  evsel->scale;
 	const char *fmt;
 	int cpu = cpu_map__id_to_cpu(id);
+	int ctx = evsel_context(evsel);
 
 	if (csv_output) {
 		fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
@@ -1091,15 +1131,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		return;
 
 	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 		if (total) {
 			ratio = avg / total;
 			fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
 		} else {
 			fprintf(output, "                                   ");
 		}
-		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
-		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
+		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
+		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
 
 		if (total && avg) {
 			ratio = total / avg;
@@ -1110,46 +1150,46 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		}
 
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-			runtime_branches_stats[cpu].n != 0) {
+			runtime_branches_stats[ctx][cpu].n != 0) {
 		print_branch_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_l1_dcache_stats[cpu].n != 0) {
+			runtime_l1_dcache_stats[ctx][cpu].n != 0) {
 		print_l1_dcache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_l1_icache_stats[cpu].n != 0) {
+			runtime_l1_icache_stats[ctx][cpu].n != 0) {
 		print_l1_icache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_dtlb_cache_stats[cpu].n != 0) {
+			runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
 		print_dtlb_cache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_itlb_cache_stats[cpu].n != 0) {
+			runtime_itlb_cache_stats[ctx][cpu].n != 0) {
 		print_itlb_cache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_ll_cache_stats[cpu].n != 0) {
+			runtime_ll_cache_stats[ctx][cpu].n != 0) {
 		print_ll_cache_misses(cpu, evsel, avg);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-			runtime_cacherefs_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cacherefs_stats[cpu]);
+			runtime_cacherefs_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
 
 		if (total)
 			ratio = avg * 100 / total;
@@ -1171,15 +1211,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		}
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 		if (total)
 			fprintf(output,
 				" #   %5.2f%% transactional cycles   ",
 				100.0 * (avg / total));
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
-		total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 		if (total2 < avg)
 			total2 = avg;
 		if (total)
@@ -1189,8 +1229,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
 		   avg > 0 &&
-		   runtime_cycles_in_tx_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		   runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
 		if (total)
 			ratio = total / avg;
@@ -1199,8 +1239,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
 		   avg > 0 &&
-		   runtime_cycles_in_tx_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		   runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
 		if (total)
 			ratio = total / avg;
@@ -1541,7 +1581,7 @@ static int setup_events(const char * const *attrs, unsigned len)
 	unsigned i;
 
 	for (i = 0; i < len; i++) {
-		if (parse_events(evsel_list, attrs[i]))
+		if (parse_events(evsel_list, attrs[i], NULL))
 			return -1;
 	}
 	return 0;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index e124741be187..d1139b6bd534 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2660,16 +2660,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
 		    "show the thread COMM next to its id"),
 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
-	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
-		    "list of events to trace"),
+	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
 		    "trace events on existing process id"),
 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
 		    "trace events on existing thread id"),
-	OPT_CALLBACK(0, "filter-pids", &trace, "float",
-		     "show only events with duration > N.M ms", trace__set_filter_pids),
+	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
+		     "pids to filter (by the kernel)", trace__set_filter_pids),
 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 59a98c643240..435b6ca85b1f 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -610,6 +610,11 @@ ifdef LIBBABELTRACE
   endif
 endif
 
+ifndef NO_AUXTRACE
+  $(call detected,CONFIG_AUXTRACE)
+  CFLAGS += -DHAVE_AUXTRACE_SUPPORT
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   template_dir
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index e14bb637255c..aa79fb8a16d4 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -54,12 +54,17 @@ struct record_opts {
 	bool	     period;
 	bool	     sample_intr_regs;
 	bool	     running_time;
+	bool	     full_auxtrace;
+	bool	     auxtrace_snapshot_mode;
 	unsigned int freq;
 	unsigned int mmap_pages;
+	unsigned int auxtrace_mmap_pages;
 	unsigned int user_freq;
 	u64          branch_stack;
 	u64	     default_interval;
 	u64	     user_interval;
+	size_t	     auxtrace_snapshot_size;
+	const char   *auxtrace_snapshot_opts;
 	bool	     sample_transaction;
 	unsigned     initial_delay;
 	bool         use_clockid;
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index f671ec37a7c4..ca0e480e741b 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -482,7 +482,7 @@ static int do_test_code_reading(bool try_kcore)
 		else
 			str = "cycles";
 		pr_debug("Parsing event '%s'\n", str);
-		ret = parse_events(evlist, str);
+		ret = parse_events(evlist, str, NULL);
 		if (ret < 0) {
 			pr_debug("parse_events failed\n");
 			goto out_err;
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index b8d8341b383e..3fa715987a5e 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -23,7 +23,7 @@ static int perf_evsel__roundtrip_cache_name_test(void)
 			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
 				__perf_evsel__hw_cache_type_op_res_name(type, op, i,
 									name, sizeof(name));
-				err = parse_events(evlist, name);
+				err = parse_events(evlist, name, NULL);
 				if (err)
 					ret = err;
 			}
@@ -71,7 +71,7 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names)
                 return -ENOMEM;
 
 	for (i = 0; i < nr_names; ++i) {
-		err = parse_events(evlist, names[i]);
+		err = parse_events(evlist, names[i], NULL);
 		if (err) {
 			pr_debug("failed to parse event '%s', err %d\n",
 				 names[i], err);
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index 18619966454c..b08a95a5ca1a 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -695,7 +695,7 @@ int test__hists_cumulate(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index 59e53db7914c..108488cd71fa 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -108,10 +108,10 @@ int test__hists_filter(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
-	err = parse_events(evlist, "task-clock");
+	err = parse_events(evlist, "task-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
index 278ba8344c23..34c61e4d3352 100644
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -282,10 +282,10 @@ int test__hists_link(void)
 	if (evlist == NULL)
                 return -ENOMEM;
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
-	err = parse_events(evlist, "task-clock");
+	err = parse_events(evlist, "task-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index b52c9faea224..d8a23db80094 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -590,7 +590,7 @@ int test__hists_output(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index 7a5ab7b0b8f6..5b171d1e338b 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -78,8 +78,8 @@ int test__keep_tracking(void)
 
 	perf_evlist__set_maps(evlist, cpus, threads);
 
-	CHECK__(parse_events(evlist, "dummy:u"));
-	CHECK__(parse_events(evlist, "cycles:u"));
+	CHECK__(parse_events(evlist, "dummy:u", NULL));
+	CHECK__(parse_events(evlist, "cycles:u", NULL));
 
 	perf_evlist__config(evlist, &opts);
 
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index bff85324f799..65280d28662e 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -32,6 +32,7 @@ make_no_backtrace   := NO_BACKTRACE=1
 make_no_libnuma     := NO_LIBNUMA=1
 make_no_libaudit    := NO_LIBAUDIT=1
 make_no_libbionic   := NO_LIBBIONIC=1
+make_no_auxtrace    := NO_AUXTRACE=1
 make_tags           := tags
 make_cscope         := cscope
 make_help           := help
@@ -52,7 +53,7 @@ make_static         := LDFLAGS=-static
 make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
-make_minimal        += NO_LIBDW_DWARF_UNWIND=1
+make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1
 
 # $(run) contains all available tests
 run := make_pure
@@ -74,6 +75,7 @@ run += make_no_backtrace
 run += make_no_libnuma
 run += make_no_libaudit
 run += make_no_libbionic
+run += make_no_auxtrace
 run += make_help
 run += make_doc
 run += make_perf_o
@@ -223,7 +225,19 @@ tarpkg:
 	echo "- $@: $$cmd" && echo $$cmd > $@ && \
 	( eval $$cmd ) >> $@ 2>&1
 
-all: $(run) $(run_O) tarpkg
+make_kernelsrc:
+	@echo " - make -C <kernelsrc> tools/perf"
+	$(call clean); \
+	(make -C ../.. tools/perf) > $@ 2>&1 && \
+	test -x perf && rm -f $@ || (cat $@ ; false)
+
+make_kernelsrc_tools:
+	@echo " - make -C <kernelsrc>/tools perf"
+	$(call clean); \
+	(make -C ../../tools perf) > $@ 2>&1 && \
+	test -x perf && rm -f $@ || (cat $@ ; false)
+
+all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
 	@echo OK
 
 out: $(run_O)
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 3de744961739..82d2a1636f7f 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1571,7 +1571,7 @@ static int test_event(struct evlist_test *e)
 	if (evlist == NULL)
 		return -ENOMEM;
 
-	ret = parse_events(evlist, e->name);
+	ret = parse_events(evlist, e->name, NULL);
 	if (ret) {
 		pr_debug("failed to parse event '%s', err %d\n",
 			 e->name, ret);
diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c
index f238442b238a..5f49484f1abc 100644
--- a/tools/perf/tests/perf-time-to-tsc.c
+++ b/tools/perf/tests/perf-time-to-tsc.c
@@ -68,7 +68,7 @@ int test__perf_time_to_tsc(void)
 
 	perf_evlist__set_maps(evlist, cpus, threads);
 
-	CHECK__(parse_events(evlist, "cycles:u"));
+	CHECK__(parse_events(evlist, "cycles:u", NULL));
 
 	perf_evlist__config(evlist, &opts);
 
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index eeb68bb1972d..faa04e9d5d5f 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -152,7 +152,8 @@ int test__pmu(void)
 		if (ret)
 			break;
 
-		ret = perf_pmu__config_terms(&formats, &attr, terms, false);
+		ret = perf_pmu__config_terms(&formats, &attr, terms,
+					     false, NULL);
 		if (ret)
 			break;
 
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index cc68648c7c55..0d31403ea593 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -347,7 +347,7 @@ int test__switch_tracking(void)
 	perf_evlist__set_maps(evlist, cpus, threads);
 
 	/* First event */
-	err = parse_events(evlist, "cpu-clock:u");
+	err = parse_events(evlist, "cpu-clock:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event dummy:u\n");
 		goto out_err;
@@ -356,7 +356,7 @@ int test__switch_tracking(void)
 	cpu_clocks_evsel = perf_evlist__last(evlist);
 
 	/* Second event */
-	err = parse_events(evlist, "cycles:u");
+	err = parse_events(evlist, "cycles:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event cycles:u\n");
 		goto out_err;
@@ -371,7 +371,7 @@ int test__switch_tracking(void)
 		goto out;
 	}
 
-	err = parse_events(evlist, sched_switch);
+	err = parse_events(evlist, sched_switch, NULL);
 	if (err) {
 		pr_debug("Failed to parse event %s\n", sched_switch);
 		goto out_err;
@@ -401,7 +401,7 @@ int test__switch_tracking(void)
 	perf_evsel__set_sample_bit(cycles_evsel, TIME);
 
 	/* Fourth event */
-	err = parse_events(evlist, "dummy:u");
+	err = parse_events(evlist, "dummy:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event dummy:u\n");
 		goto out_err;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 995b7a8596b1..f981cb8f0158 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -25,6 +25,9 @@ struct hist_browser {
 	struct hists	    *hists;
 	struct hist_entry   *he_selection;
 	struct map_symbol   *selection;
+	struct hist_browser_timer *hbt;
+	struct pstack	    *pstack;
+	struct perf_session_env *env;
 	int		     print_seq;
 	bool		     show_dso;
 	bool		     show_headers;
@@ -60,7 +63,7 @@ static int hist_browser__get_folding(struct hist_browser *browser)
 		struct hist_entry *he =
 			rb_entry(nd, struct hist_entry, rb_node);
 
-		if (he->ms.unfolded)
+		if (he->unfolded)
 			unfolded_rows += he->nr_rows;
 	}
 	return unfolded_rows;
@@ -136,24 +139,19 @@ static char tree__folded_sign(bool unfolded)
 	return unfolded ? '-' : '+';
 }
 
-static char map_symbol__folded(const struct map_symbol *ms)
-{
-	return ms->has_children ? tree__folded_sign(ms->unfolded) : ' ';
-}
-
 static char hist_entry__folded(const struct hist_entry *he)
 {
-	return map_symbol__folded(&he->ms);
+	return he->has_children ? tree__folded_sign(he->unfolded) : ' ';
 }
 
 static char callchain_list__folded(const struct callchain_list *cl)
 {
-	return map_symbol__folded(&cl->ms);
+	return cl->has_children ? tree__folded_sign(cl->unfolded) : ' ';
 }
 
-static void map_symbol__set_folding(struct map_symbol *ms, bool unfold)
+static void callchain_list__set_folding(struct callchain_list *cl, bool unfold)
 {
-	ms->unfolded = unfold ? ms->has_children : false;
+	cl->unfolded = unfold ? cl->has_children : false;
 }
 
 static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
@@ -189,7 +187,7 @@ static int callchain_node__count_rows(struct callchain_node *node)
 
 	list_for_each_entry(chain, &node->val, list) {
 		++n;
-		unfolded = chain->ms.unfolded;
+		unfolded = chain->unfolded;
 	}
 
 	if (unfolded)
@@ -211,15 +209,27 @@ static int callchain__count_rows(struct rb_root *chain)
 	return n;
 }
 
-static bool map_symbol__toggle_fold(struct map_symbol *ms)
+static bool hist_entry__toggle_fold(struct hist_entry *he)
 {
-	if (!ms)
+	if (!he)
 		return false;
 
-	if (!ms->has_children)
+	if (!he->has_children)
 		return false;
 
-	ms->unfolded = !ms->unfolded;
+	he->unfolded = !he->unfolded;
+	return true;
+}
+
+static bool callchain_list__toggle_fold(struct callchain_list *cl)
+{
+	if (!cl)
+		return false;
+
+	if (!cl->has_children)
+		return false;
+
+	cl->unfolded = !cl->unfolded;
 	return true;
 }
 
@@ -235,10 +245,10 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *no
 		list_for_each_entry(chain, &child->val, list) {
 			if (first) {
 				first = false;
-				chain->ms.has_children = chain->list.next != &child->val ||
+				chain->has_children = chain->list.next != &child->val ||
 							 !RB_EMPTY_ROOT(&child->rb_root);
 			} else
-				chain->ms.has_children = chain->list.next == &child->val &&
+				chain->has_children = chain->list.next == &child->val &&
 							 !RB_EMPTY_ROOT(&child->rb_root);
 		}
 
@@ -252,11 +262,11 @@ static void callchain_node__init_have_children(struct callchain_node *node,
 	struct callchain_list *chain;
 
 	chain = list_entry(node->val.next, struct callchain_list, list);
-	chain->ms.has_children = has_sibling;
+	chain->has_children = has_sibling;
 
 	if (!list_empty(&node->val)) {
 		chain = list_entry(node->val.prev, struct callchain_list, list);
-		chain->ms.has_children = !RB_EMPTY_ROOT(&node->rb_root);
+		chain->has_children = !RB_EMPTY_ROOT(&node->rb_root);
 	}
 
 	callchain_node__init_have_children_rb_tree(node);
@@ -276,7 +286,7 @@ static void callchain__init_have_children(struct rb_root *root)
 static void hist_entry__init_have_children(struct hist_entry *he)
 {
 	if (!he->init_have_children) {
-		he->ms.has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
+		he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
 		callchain__init_have_children(&he->sorted_chain);
 		he->init_have_children = true;
 	}
@@ -284,14 +294,22 @@ static void hist_entry__init_have_children(struct hist_entry *he)
 
 static bool hist_browser__toggle_fold(struct hist_browser *browser)
 {
-	if (map_symbol__toggle_fold(browser->selection)) {
-		struct hist_entry *he = browser->he_selection;
+	struct hist_entry *he = browser->he_selection;
+	struct map_symbol *ms = browser->selection;
+	struct callchain_list *cl = container_of(ms, struct callchain_list, ms);
+	bool has_children;
 
+	if (ms == &he->ms)
+		has_children = hist_entry__toggle_fold(he);
+	else
+		has_children = callchain_list__toggle_fold(cl);
+
+	if (has_children) {
 		hist_entry__init_have_children(he);
 		browser->b.nr_entries -= he->nr_rows;
 		browser->nr_callchain_rows -= he->nr_rows;
 
-		if (he->ms.unfolded)
+		if (he->unfolded)
 			he->nr_rows = callchain__count_rows(&he->sorted_chain);
 		else
 			he->nr_rows = 0;
@@ -318,8 +336,8 @@ static int callchain_node__set_folding_rb_tree(struct callchain_node *node, bool
 
 		list_for_each_entry(chain, &child->val, list) {
 			++n;
-			map_symbol__set_folding(&chain->ms, unfold);
-			has_children = chain->ms.has_children;
+			callchain_list__set_folding(chain, unfold);
+			has_children = chain->has_children;
 		}
 
 		if (has_children)
@@ -337,8 +355,8 @@ static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
 
 	list_for_each_entry(chain, &node->val, list) {
 		++n;
-		map_symbol__set_folding(&chain->ms, unfold);
-		has_children = chain->ms.has_children;
+		callchain_list__set_folding(chain, unfold);
+		has_children = chain->has_children;
 	}
 
 	if (has_children)
@@ -363,9 +381,9 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
 static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
 {
 	hist_entry__init_have_children(he);
-	map_symbol__set_folding(&he->ms, unfold);
+	he->unfolded = unfold ? he->has_children : false;
 
-	if (he->ms.has_children) {
+	if (he->has_children) {
 		int n = callchain__set_folding(&he->sorted_chain, unfold);
 		he->nr_rows = unfold ? n : 0;
 	} else
@@ -406,11 +424,11 @@ static void ui_browser__warn_lost_events(struct ui_browser *browser)
 		"Or reduce the sampling frequency.");
 }
 
-static int hist_browser__run(struct hist_browser *browser,
-			     struct hist_browser_timer *hbt)
+static int hist_browser__run(struct hist_browser *browser)
 {
 	int key;
 	char title[160];
+	struct hist_browser_timer *hbt = browser->hbt;
 	int delay_secs = hbt ? hbt->refresh : 0;
 
 	browser->b.entries = &browser->hists->entries;
@@ -1016,7 +1034,7 @@ do_offset:
 	if (offset > 0) {
 		do {
 			h = rb_entry(nd, struct hist_entry, rb_node);
-			if (h->ms.unfolded) {
+			if (h->unfolded) {
 				u16 remaining = h->nr_rows - h->row_offset;
 				if (offset > remaining) {
 					offset -= remaining;
@@ -1037,7 +1055,7 @@ do_offset:
 	} else if (offset < 0) {
 		while (1) {
 			h = rb_entry(nd, struct hist_entry, rb_node);
-			if (h->ms.unfolded) {
+			if (h->unfolded) {
 				if (first) {
 					if (-offset > h->row_offset) {
 						offset += h->row_offset;
@@ -1074,7 +1092,7 @@ do_offset:
 				 * row_offset at its last entry.
 				 */
 				h = rb_entry(nd, struct hist_entry, rb_node);
-				if (h->ms.unfolded)
+				if (h->unfolded)
 					h->row_offset = h->nr_rows;
 				break;
 			}
@@ -1195,7 +1213,9 @@ static int hist_browser__dump(struct hist_browser *browser)
 	return 0;
 }
 
-static struct hist_browser *hist_browser__new(struct hists *hists)
+static struct hist_browser *hist_browser__new(struct hists *hists,
+					      struct hist_browser_timer *hbt,
+					      struct perf_session_env *env)
 {
 	struct hist_browser *browser = zalloc(sizeof(*browser));
 
@@ -1206,6 +1226,8 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
 		browser->b.seek = ui_browser__hists_seek;
 		browser->b.use_navkeypressed = true;
 		browser->show_headers = symbol_conf.show_hist_headers;
+		browser->hbt = hbt;
+		browser->env = env;
 	}
 
 	return browser;
@@ -1395,6 +1417,257 @@ close_file_and_continue:
 	return ret;
 }
 
+struct popup_action {
+	struct thread 		*thread;
+	struct dso		*dso;
+	struct map_symbol 	ms;
+
+	int (*fn)(struct hist_browser *browser, struct popup_action *act);
+};
+
+static int
+do_annotate(struct hist_browser *browser, struct popup_action *act)
+{
+	struct perf_evsel *evsel;
+	struct annotation *notes;
+	struct hist_entry *he;
+	int err;
+
+	if (!objdump_path && perf_session_env__lookup_objdump(browser->env))
+		return 0;
+
+	notes = symbol__annotation(act->ms.sym);
+	if (!notes->src)
+		return 0;
+
+	evsel = hists_to_evsel(browser->hists);
+	err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt);
+	he = hist_browser__selected_entry(browser);
+	/*
+	 * offer option to annotate the other branch source or target
+	 * (if they exist) when returning from annotate
+	 */
+	if ((err == 'q' || err == CTRL('c')) && he->branch_info)
+		return 1;
+
+	ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
+	if (err)
+		ui_browser__handle_resize(&browser->b);
+	return 0;
+}
+
+static int
+add_annotate_opt(struct hist_browser *browser __maybe_unused,
+		 struct popup_action *act, char **optstr,
+		 struct map *map, struct symbol *sym)
+{
+	if (sym == NULL || map->dso->annotate_warned)
+		return 0;
+
+	if (asprintf(optstr, "Annotate %s", sym->name) < 0)
+		return 0;
+
+	act->ms.map = map;
+	act->ms.sym = sym;
+	act->fn = do_annotate;
+	return 1;
+}
+
+static int
+do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
+{
+	struct thread *thread = act->thread;
+
+	if (browser->hists->thread_filter) {
+		pstack__remove(browser->pstack, &browser->hists->thread_filter);
+		perf_hpp__set_elide(HISTC_THREAD, false);
+		thread__zput(browser->hists->thread_filter);
+		ui_helpline__pop();
+	} else {
+		ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
+				   thread->comm_set ? thread__comm_str(thread) : "",
+				   thread->tid);
+		browser->hists->thread_filter = thread__get(thread);
+		perf_hpp__set_elide(HISTC_THREAD, false);
+		pstack__push(browser->pstack, &browser->hists->thread_filter);
+	}
+
+	hists__filter_by_thread(browser->hists);
+	hist_browser__reset(browser);
+	return 0;
+}
+
+static int
+add_thread_opt(struct hist_browser *browser, struct popup_action *act,
+	       char **optstr, struct thread *thread)
+{
+	if (thread == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Zoom %s %s(%d) thread",
+		     browser->hists->thread_filter ? "out of" : "into",
+		     thread->comm_set ? thread__comm_str(thread) : "",
+		     thread->tid) < 0)
+		return 0;
+
+	act->thread = thread;
+	act->fn = do_zoom_thread;
+	return 1;
+}
+
+static int
+do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
+{
+	struct dso *dso = act->dso;
+
+	if (browser->hists->dso_filter) {
+		pstack__remove(browser->pstack, &browser->hists->dso_filter);
+		perf_hpp__set_elide(HISTC_DSO, false);
+		browser->hists->dso_filter = NULL;
+		ui_helpline__pop();
+	} else {
+		if (dso == NULL)
+			return 0;
+		ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
+				   dso->kernel ? "the Kernel" : dso->short_name);
+		browser->hists->dso_filter = dso;
+		perf_hpp__set_elide(HISTC_DSO, true);
+		pstack__push(browser->pstack, &browser->hists->dso_filter);
+	}
+
+	hists__filter_by_dso(browser->hists);
+	hist_browser__reset(browser);
+	return 0;
+}
+
+static int
+add_dso_opt(struct hist_browser *browser, struct popup_action *act,
+	    char **optstr, struct dso *dso)
+{
+	if (dso == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Zoom %s %s DSO",
+		     browser->hists->dso_filter ? "out of" : "into",
+		     dso->kernel ? "the Kernel" : dso->short_name) < 0)
+		return 0;
+
+	act->dso = dso;
+	act->fn = do_zoom_dso;
+	return 1;
+}
+
+static int
+do_browse_map(struct hist_browser *browser __maybe_unused,
+	      struct popup_action *act)
+{
+	map__browse(act->ms.map);
+	return 0;
+}
+
+static int
+add_map_opt(struct hist_browser *browser __maybe_unused,
+	    struct popup_action *act, char **optstr, struct map *map)
+{
+	if (map == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Browse map details") < 0)
+		return 0;
+
+	act->ms.map = map;
+	act->fn = do_browse_map;
+	return 1;
+}
+
+static int
+do_run_script(struct hist_browser *browser __maybe_unused,
+	      struct popup_action *act)
+{
+	char script_opt[64];
+	memset(script_opt, 0, sizeof(script_opt));
+
+	if (act->thread) {
+		scnprintf(script_opt, sizeof(script_opt), " -c %s ",
+			  thread__comm_str(act->thread));
+	} else if (act->ms.sym) {
+		scnprintf(script_opt, sizeof(script_opt), " -S %s ",
+			  act->ms.sym->name);
+	}
+
+	script_browse(script_opt);
+	return 0;
+}
+
+static int
+add_script_opt(struct hist_browser *browser __maybe_unused,
+	       struct popup_action *act, char **optstr,
+	       struct thread *thread, struct symbol *sym)
+{
+	if (thread) {
+		if (asprintf(optstr, "Run scripts for samples of thread [%s]",
+			     thread__comm_str(thread)) < 0)
+			return 0;
+	} else if (sym) {
+		if (asprintf(optstr, "Run scripts for samples of symbol [%s]",
+			     sym->name) < 0)
+			return 0;
+	} else {
+		if (asprintf(optstr, "Run scripts for all samples") < 0)
+			return 0;
+	}
+
+	act->thread = thread;
+	act->ms.sym = sym;
+	act->fn = do_run_script;
+	return 1;
+}
+
+static int
+do_switch_data(struct hist_browser *browser __maybe_unused,
+	       struct popup_action *act __maybe_unused)
+{
+	if (switch_data_file()) {
+		ui__warning("Won't switch the data files due to\n"
+			    "no valid data file get selected!\n");
+		return 0;
+	}
+
+	return K_SWITCH_INPUT_DATA;
+}
+
+static int
+add_switch_opt(struct hist_browser *browser,
+	       struct popup_action *act, char **optstr)
+{
+	if (!is_report_browser(browser->hbt))
+		return 0;
+
+	if (asprintf(optstr, "Switch to another data file in PWD") < 0)
+		return 0;
+
+	act->fn = do_switch_data;
+	return 1;
+}
+
+static int
+do_exit_browser(struct hist_browser *browser __maybe_unused,
+		struct popup_action *act __maybe_unused)
+{
+	return 0;
+}
+
+static int
+add_exit_opt(struct hist_browser *browser __maybe_unused,
+	     struct popup_action *act, char **optstr)
+{
+	if (asprintf(optstr, "Exit") < 0)
+		return 0;
+
+	act->fn = do_exit_browser;
+	return 1;
+}
+
 static void hist_browser__update_nr_entries(struct hist_browser *hb)
 {
 	u64 nr_entries = 0;
@@ -1421,14 +1694,14 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 				    struct perf_session_env *env)
 {
 	struct hists *hists = evsel__hists(evsel);
-	struct hist_browser *browser = hist_browser__new(hists);
+	struct hist_browser *browser = hist_browser__new(hists, hbt, env);
 	struct branch_info *bi;
-	struct pstack *fstack;
-	char *options[16];
+#define MAX_OPTIONS  16
+	char *options[MAX_OPTIONS];
+	struct popup_action actions[MAX_OPTIONS];
 	int nr_options = 0;
 	int key = -1;
 	char buf[64];
-	char script_opt[64];
 	int delay_secs = hbt ? hbt->refresh : 0;
 	struct perf_hpp_fmt *fmt;
 
@@ -1473,13 +1746,14 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		hist_browser__update_nr_entries(browser);
 	}
 
-	fstack = pstack__new(2);
-	if (fstack == NULL)
+	browser->pstack = pstack__new(2);
+	if (browser->pstack == NULL)
 		goto out;
 
 	ui_helpline__push(helpline);
 
 	memset(options, 0, sizeof(options));
+	memset(actions, 0, sizeof(actions));
 
 	perf_hpp__for_each_format(fmt)
 		perf_hpp__reset_width(fmt, hists);
@@ -1489,16 +1763,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 
 	while (1) {
 		struct thread *thread = NULL;
-		const struct dso *dso = NULL;
-		int choice = 0,
-		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
-		    annotate_f = -2, annotate_t = -2, browse_map = -2;
-		int scripts_comm = -2, scripts_symbol = -2,
-		    scripts_all = -2, switch_data = -2;
+		struct dso *dso = NULL;
+		int choice = 0;
 
 		nr_options = 0;
 
-		key = hist_browser__run(browser, hbt);
+		key = hist_browser__run(browser);
 
 		if (browser->he_selection != NULL) {
 			thread = hist_browser__selected_thread(browser);
@@ -1526,17 +1796,25 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			    browser->selection->sym == NULL ||
 			    browser->selection->map->dso->annotate_warned)
 				continue;
-			goto do_annotate;
+
+			actions->ms.map = browser->selection->map;
+			actions->ms.sym = browser->selection->sym;
+			do_annotate(browser, actions);
+			continue;
 		case 'P':
 			hist_browser__dump(browser);
 			continue;
 		case 'd':
-			goto zoom_dso;
+			actions->dso = dso;
+			do_zoom_dso(browser, actions);
+			continue;
 		case 'V':
 			browser->show_dso = !browser->show_dso;
 			continue;
 		case 't':
-			goto zoom_thread;
+			actions->thread = thread;
+			do_zoom_thread(browser, actions);
+			continue;
 		case '/':
 			if (ui_browser__input_window("Symbol to show",
 					"Please enter the name of symbol you want to see",
@@ -1548,12 +1826,18 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			}
 			continue;
 		case 'r':
-			if (is_report_browser(hbt))
-				goto do_scripts;
+			if (is_report_browser(hbt)) {
+				actions->thread = NULL;
+				actions->ms.sym = NULL;
+				do_run_script(browser, actions);
+			}
 			continue;
 		case 's':
-			if (is_report_browser(hbt))
-				goto do_data_switch;
+			if (is_report_browser(hbt)) {
+				key = do_switch_data(browser, actions);
+				if (key == K_SWITCH_INPUT_DATA)
+					goto out_free_stack;
+			}
 			continue;
 		case 'i':
 			/* env->arch is NULL for live-mode (i.e. perf top) */
@@ -1583,7 +1867,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		case K_LEFT: {
 			const void *top;
 
-			if (pstack__empty(fstack)) {
+			if (pstack__empty(browser->pstack)) {
 				/*
 				 * Go back to the perf_evsel_menu__run or other user
 				 */
@@ -1591,11 +1875,17 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 					goto out_free_stack;
 				continue;
 			}
-			top = pstack__pop(fstack);
-			if (top == &browser->hists->dso_filter)
-				goto zoom_out_dso;
+			top = pstack__peek(browser->pstack);
+			if (top == &browser->hists->dso_filter) {
+				/*
+				 * No need to set actions->dso here since
+				 * it's just to remove the current filter.
+				 * Ditto for thread below.
+				 */
+				do_zoom_dso(browser, actions);
+			}
 			if (top == &browser->hists->thread_filter)
-				goto zoom_out_thread;
+				do_zoom_thread(browser, actions);
 			continue;
 		}
 		case K_ESC:
@@ -1623,196 +1913,71 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			if (bi == NULL)
 				goto skip_annotation;
 
-			if (bi->from.sym != NULL &&
-			    !bi->from.map->dso->annotate_warned &&
-			    asprintf(&options[nr_options], "Annotate %s", bi->from.sym->name) > 0) {
-				annotate_f = nr_options++;
-			}
-
-			if (bi->to.sym != NULL &&
-			    !bi->to.map->dso->annotate_warned &&
-			    (bi->to.sym != bi->from.sym ||
-			     bi->to.map->dso != bi->from.map->dso) &&
-			    asprintf(&options[nr_options], "Annotate %s", bi->to.sym->name) > 0) {
-				annotate_t = nr_options++;
-			}
+			nr_options += add_annotate_opt(browser,
+						       &actions[nr_options],
+						       &options[nr_options],
+						       bi->from.map,
+						       bi->from.sym);
+			if (bi->to.sym != bi->from.sym)
+				nr_options += add_annotate_opt(browser,
+							&actions[nr_options],
+							&options[nr_options],
+							bi->to.map,
+							bi->to.sym);
 		} else {
-			if (browser->selection->sym != NULL &&
-			    !browser->selection->map->dso->annotate_warned) {
-				struct annotation *notes;
-
-				notes = symbol__annotation(browser->selection->sym);
-
-				if (notes->src &&
-				    asprintf(&options[nr_options], "Annotate %s",
-						 browser->selection->sym->name) > 0) {
-					annotate = nr_options++;
-				}
-			}
+			nr_options += add_annotate_opt(browser,
+						       &actions[nr_options],
+						       &options[nr_options],
+						       browser->selection->map,
+						       browser->selection->sym);
 		}
 skip_annotation:
-		if (thread != NULL &&
-		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
-			     (browser->hists->thread_filter ? "out of" : "into"),
-			     (thread->comm_set ? thread__comm_str(thread) : ""),
-			     thread->tid) > 0)
-			zoom_thread = nr_options++;
-
-		if (dso != NULL &&
-		    asprintf(&options[nr_options], "Zoom %s %s DSO",
-			     (browser->hists->dso_filter ? "out of" : "into"),
-			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
-			zoom_dso = nr_options++;
-
-		if (browser->selection != NULL &&
-		    browser->selection->map != NULL &&
-		    asprintf(&options[nr_options], "Browse map details") > 0)
-			browse_map = nr_options++;
+		nr_options += add_thread_opt(browser, &actions[nr_options],
+					     &options[nr_options], thread);
+		nr_options += add_dso_opt(browser, &actions[nr_options],
+					  &options[nr_options], dso);
+		nr_options += add_map_opt(browser, &actions[nr_options],
+					  &options[nr_options],
+					  browser->selection->map);
 
 		/* perf script support */
 		if (browser->he_selection) {
-			struct symbol *sym;
-
-			if (asprintf(&options[nr_options], "Run scripts for samples of thread [%s]",
-				     thread__comm_str(browser->he_selection->thread)) > 0)
-				scripts_comm = nr_options++;
-
-			sym = browser->he_selection->ms.sym;
-			if (sym && sym->namelen &&
-				asprintf(&options[nr_options], "Run scripts for samples of symbol [%s]",
-						sym->name) > 0)
-				scripts_symbol = nr_options++;
+			nr_options += add_script_opt(browser,
+						     &actions[nr_options],
+						     &options[nr_options],
+						     thread, NULL);
+			nr_options += add_script_opt(browser,
+						     &actions[nr_options],
+						     &options[nr_options],
+						     NULL, browser->selection->sym);
 		}
-
-		if (asprintf(&options[nr_options], "Run scripts for all samples") > 0)
-			scripts_all = nr_options++;
-
-		if (is_report_browser(hbt) && asprintf(&options[nr_options],
-				"Switch to another data file in PWD") > 0)
-			switch_data = nr_options++;
+		nr_options += add_script_opt(browser, &actions[nr_options],
+					     &options[nr_options], NULL, NULL);
+		nr_options += add_switch_opt(browser, &actions[nr_options],
+					     &options[nr_options]);
 add_exit_option:
-		options[nr_options++] = (char *)"Exit";
-retry_popup_menu:
-		choice = ui__popup_menu(nr_options, options);
-
-		if (choice == nr_options - 1)
-			break;
+		nr_options += add_exit_opt(browser, &actions[nr_options],
+					   &options[nr_options]);
 
-		if (choice == -1) {
-			free_popup_options(options, nr_options - 1);
-			continue;
-		}
-
-		if (choice == annotate || choice == annotate_t || choice == annotate_f) {
-			struct hist_entry *he;
-			struct annotation *notes;
-			struct map_symbol ms;
-			int err;
-do_annotate:
-			if (!objdump_path && perf_session_env__lookup_objdump(env))
-				continue;
-
-			he = hist_browser__selected_entry(browser);
-			if (he == NULL)
-				continue;
-
-			if (choice == annotate_f) {
-				ms.map = he->branch_info->from.map;
-				ms.sym = he->branch_info->from.sym;
-			} else if (choice == annotate_t) {
-				ms.map = he->branch_info->to.map;
-				ms.sym = he->branch_info->to.sym;
-			} else {
-				ms = *browser->selection;
-			}
-
-			notes = symbol__annotation(ms.sym);
-			if (!notes->src)
-				continue;
-
-			err = map_symbol__tui_annotate(&ms, evsel, hbt);
-			/*
-			 * offer option to annotate the other branch source or target
-			 * (if they exists) when returning from annotate
-			 */
-			if ((err == 'q' || err == CTRL('c'))
-			    && annotate_t != -2 && annotate_f != -2)
-				goto retry_popup_menu;
-
-			ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
-			if (err)
-				ui_browser__handle_resize(&browser->b);
-
-		} else if (choice == browse_map)
-			map__browse(browser->selection->map);
-		else if (choice == zoom_dso) {
-zoom_dso:
-			if (browser->hists->dso_filter) {
-				pstack__remove(fstack, &browser->hists->dso_filter);
-zoom_out_dso:
-				ui_helpline__pop();
-				browser->hists->dso_filter = NULL;
-				perf_hpp__set_elide(HISTC_DSO, false);
-			} else {
-				if (dso == NULL)
-					continue;
-				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
-						   dso->kernel ? "the Kernel" : dso->short_name);
-				browser->hists->dso_filter = dso;
-				perf_hpp__set_elide(HISTC_DSO, true);
-				pstack__push(fstack, &browser->hists->dso_filter);
-			}
-			hists__filter_by_dso(hists);
-			hist_browser__reset(browser);
-		} else if (choice == zoom_thread) {
-zoom_thread:
-			if (browser->hists->thread_filter) {
-				pstack__remove(fstack, &browser->hists->thread_filter);
-zoom_out_thread:
-				ui_helpline__pop();
-				thread__zput(browser->hists->thread_filter);
-				perf_hpp__set_elide(HISTC_THREAD, false);
-			} else {
-				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
-						   thread->comm_set ? thread__comm_str(thread) : "",
-						   thread->tid);
-				browser->hists->thread_filter = thread__get(thread);
-				perf_hpp__set_elide(HISTC_THREAD, false);
-				pstack__push(fstack, &browser->hists->thread_filter);
-			}
-			hists__filter_by_thread(hists);
-			hist_browser__reset(browser);
-		}
-		/* perf scripts support */
-		else if (choice == scripts_all || choice == scripts_comm ||
-				choice == scripts_symbol) {
-do_scripts:
-			memset(script_opt, 0, 64);
+		do {
+			struct popup_action *act;
 
-			if (choice == scripts_comm)
-				sprintf(script_opt, " -c %s ", thread__comm_str(browser->he_selection->thread));
+			choice = ui__popup_menu(nr_options, options);
+			if (choice == -1 || choice >= nr_options)
+				break;
 
-			if (choice == scripts_symbol)
-				sprintf(script_opt, " -S %s ", browser->he_selection->ms.sym->name);
+			act = &actions[choice];
+			key = act->fn(browser, act);
+		} while (key == 1);
 
-			script_browse(script_opt);
-		}
-		/* Switch to another data file */
-		else if (choice == switch_data) {
-do_data_switch:
-			if (!switch_data_file()) {
-				key = K_SWITCH_INPUT_DATA;
-				break;
-			} else
-				ui__warning("Won't switch the data files due to\n"
-					"no valid data file get selected!\n");
-		}
+		if (key == K_SWITCH_INPUT_DATA)
+			break;
 	}
 out_free_stack:
-	pstack__delete(fstack);
+	pstack__delete(browser->pstack);
 out:
 	hist_browser__delete(browser);
-	free_popup_options(options, nr_options - 1);
+	free_popup_options(options, MAX_OPTIONS);
 	return key;
 }
 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 797490a40075..d552203aead0 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -74,6 +74,7 @@ libperf-y += data.o
 libperf-$(CONFIG_X86) += tsc.o
 libperf-y += cloexec.o
 libperf-y += thread-stack.o
+libperf-$(CONFIG_AUXTRACE) += auxtrace.o
 
 libperf-$(CONFIG_LIBELF) += symbol-elf.o
 libperf-$(CONFIG_LIBELF) += probe-event.o
@@ -117,7 +118,7 @@ $(OUTPUT)util/pmu-bison.c: util/pmu.y
 
 CFLAGS_parse-events-flex.o  += -w
 CFLAGS_pmu-flex.o           += -w
-CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
 CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
 
 $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
new file mode 100644
index 000000000000..df66966cfde7
--- /dev/null
+++ b/tools/perf/util/auxtrace.c
@@ -0,0 +1,1352 @@
+/*
+ * auxtrace.c: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+#include <sys/param.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <linux/list.h>
+
+#include "../perf.h"
+#include "util.h"
+#include "evlist.h"
+#include "cpumap.h"
+#include "thread_map.h"
+#include "asm/bug.h"
+#include "auxtrace.h"
+
+#include <linux/hash.h>
+
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+#include "parse-options.h"
+
+/*
+ * Initialise @mm from the parameters in @mp and map the AUX area of @fd.
+ *
+ * @userpg is treated as the struct perf_event_mmap_page of the event; the AUX
+ * area offset and size are stored in it before mmap() is called.
+ *
+ * Returns 0 on success, including the case where mp->len is zero (nothing is
+ * mapped and mm->base is NULL), or -1 on failure (mm->base is set to NULL when
+ * mmap() itself fails).
+ */
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd)
+{
+	struct perf_event_mmap_page *pc = userpg;
+
+	/* When this condition holds, the function unconditionally fails here */
+#if BITS_PER_LONG != 64 && !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	pr_err("Cannot use AUX area tracing mmaps\n");
+	return -1;
+#endif
+
+	/* Warns if mm->base is already non-NULL on entry */
+	WARN_ONCE(mm->base, "Uninitialized auxtrace_mmap\n");
+
+	mm->userpg = userpg;
+	mm->mask = mp->mask;
+	mm->len = mp->len;
+	mm->prev = 0;
+	mm->idx = mp->idx;
+	mm->tid = mp->tid;
+	mm->cpu = mp->cpu;
+
+	/* Zero length means there is no AUX area to map */
+	if (!mp->len) {
+		mm->base = NULL;
+		return 0;
+	}
+
+	/* Record where the AUX area lives before mapping it */
+	pc->aux_offset = mp->offset;
+	pc->aux_size = mp->len;
+
+	mm->base = mmap(NULL, mp->len, mp->prot, MAP_SHARED, fd, mp->offset);
+	if (mm->base == MAP_FAILED) {
+		pr_debug2("failed to mmap AUX area\n");
+		mm->base = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Unmap the AUX area of @mm, if it is mapped, and mark it as unmapped. */
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm)
+{
+	if (!mm->base)
+		return;
+
+	munmap(mm->base, mm->len);
+	mm->base = NULL;
+}
+
+/*
+ * Fill in the offset, length, mask and protection of @mp for an AUX area of
+ * @auxtrace_pages pages at @auxtrace_offset.  With zero pages only mp->len is
+ * set (to 0).  The mask is only non-zero when the length is a power of 2, and
+ * PROT_WRITE is only requested when @auxtrace_overwrite is false.
+ */
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite)
+{
+	if (!auxtrace_pages) {
+		mp->len = 0;
+		return;
+	}
+
+	mp->offset = auxtrace_offset;
+	mp->len = auxtrace_pages * (size_t)page_size;
+
+	if (is_power_of_2(mp->len))
+		mp->mask = mp->len - 1;
+	else
+		mp->mask = 0;
+
+	if (auxtrace_overwrite)
+		mp->prot = PROT_READ;
+	else
+		mp->prot = PROT_READ | PROT_WRITE;
+
+	pr_debug2("AUX area mmap length %zu\n", mp->len);
+}
+
+/*
+ * Set the index, cpu and tid of @mp for mmap number @idx of @evlist.
+ *
+ * Per-cpu: cpu comes from the evlist cpu map, tid is the first thread of the
+ * thread map or -1 if there is no thread map.
+ * Per-thread: cpu is -1 and tid comes from the evlist thread map.
+ */
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu)
+{
+	mp->idx = idx;
+
+	if (!per_cpu) {
+		mp->cpu = -1;
+		mp->tid = evlist->threads->map[idx];
+		return;
+	}
+
+	mp->cpu = evlist->cpus->map[idx];
+	mp->tid = evlist->threads ? evlist->threads->map[0] : -1;
+}
+
+#define AUXTRACE_INIT_NR_QUEUES	32
+
+/*
+ * Allocate a zeroed array of @nr_queues queues, each with an empty buffer list
+ * and no private data.  Returns NULL if @nr_queues is too large or the
+ * allocation fails.
+ */
+static struct auxtrace_queue *auxtrace_alloc_queue_array(unsigned int nr_queues)
+{
+	const unsigned int limit = UINT_MAX / sizeof(struct auxtrace_queue);
+	struct auxtrace_queue *array, *queue;
+
+	if (nr_queues > limit)
+		return NULL;
+
+	array = calloc(nr_queues, sizeof(struct auxtrace_queue));
+	if (!array)
+		return NULL;
+
+	for (queue = array; queue < array + nr_queues; queue++) {
+		INIT_LIST_HEAD(&queue->head);
+		queue->priv = NULL;
+	}
+
+	return array;
+}
+
+/*
+ * Give @queues an initial array of AUXTRACE_INIT_NR_QUEUES empty queues.
+ * Returns 0 on success or -ENOMEM if the array cannot be allocated.
+ */
+int auxtrace_queues__init(struct auxtrace_queues *queues)
+{
+	struct auxtrace_queue *array;
+
+	array = auxtrace_alloc_queue_array(AUXTRACE_INIT_NR_QUEUES);
+
+	queues->nr_queues = AUXTRACE_INIT_NR_QUEUES;
+	queues->queue_array = array;
+
+	return array ? 0 : -ENOMEM;
+}
+
+/*
+ * Grow the queue array so that it holds at least @new_nr_queues queues.
+ *
+ * The size is doubled (starting from AUXTRACE_INIT_NR_QUEUES for an empty
+ * array) until it is big enough.  Existing queues keep their index: their
+ * buffer lists are spliced onto the new array and their per-queue state
+ * (tid, cpu, set, priv) is carried over.
+ *
+ * Returns 0 on success, -EINVAL if the size overflows, or -ENOMEM if the new
+ * array cannot be allocated (the existing array is then left untouched).
+ */
+static int auxtrace_queues__grow(struct auxtrace_queues *queues,
+				 unsigned int new_nr_queues)
+{
+	unsigned int nr_queues = queues->nr_queues;
+	struct auxtrace_queue *queue_array;
+	unsigned int i;
+
+	if (!nr_queues)
+		nr_queues = AUXTRACE_INIT_NR_QUEUES;
+
+	/* A shift that overflows yields 0, which ends the loop */
+	while (nr_queues && nr_queues < new_nr_queues)
+		nr_queues <<= 1;
+
+	if (nr_queues < queues->nr_queues || nr_queues < new_nr_queues)
+		return -EINVAL;
+
+	queue_array = auxtrace_alloc_queue_array(nr_queues);
+	if (!queue_array)
+		return -ENOMEM;
+
+	for (i = 0; i < queues->nr_queues; i++) {
+		list_splice_tail(&queues->queue_array[i].head,
+				 &queue_array[i].head);
+		/*
+		 * Carry the identity of the queue over as well, otherwise a
+		 * queue that was already set looks unset after the resize and
+		 * auxtrace_queues__add_buffer() stops detecting conflicts.
+		 */
+		queue_array[i].tid = queues->queue_array[i].tid;
+		queue_array[i].cpu = queues->queue_array[i].cpu;
+		queue_array[i].set = queues->queue_array[i].set;
+		queue_array[i].priv = queues->queue_array[i].priv;
+	}
+
+	/*
+	 * Every buffer list has been moved to the new array, so the old one
+	 * holds nothing any more and can be released instead of leaked.
+	 */
+	free(queues->queue_array);
+
+	queues->nr_queues = nr_queues;
+	queues->queue_array = queue_array;
+
+	return 0;
+}
+
+/*
+ * Read @size bytes from the data file of @session into a newly allocated
+ * buffer.  Returns the buffer, which the caller must free, or NULL if @size is
+ * larger than SSIZE_MAX, the allocation fails or the read comes up short.
+ */
+static void *auxtrace_copy_data(u64 size, struct perf_session *session)
+{
+	int fd = perf_data_file__fd(session->file);
+	void *data;
+
+	if (size > SSIZE_MAX)
+		return NULL;
+
+	data = malloc(size);
+	if (!data)
+		return NULL;
+
+	if (readn(fd, data, size) == (ssize_t)size)
+		return data;
+
+	free(data);
+	return NULL;
+}
+
+/*
+ * Append @buffer to queue number @idx, growing the queue array if needed.
+ *
+ * The first buffer added to a queue defines the cpu and tid of that queue;
+ * later buffers must match or -EINVAL is returned.  On success the buffer is
+ * given the next buffer number and the queues are flagged as populated and as
+ * having new data.
+ *
+ * Returns 0 on success or a negative error code, in which case @buffer has not
+ * been added to any queue.
+ */
+static int auxtrace_queues__add_buffer(struct auxtrace_queues *queues,
+				       unsigned int idx,
+				       struct auxtrace_buffer *buffer)
+{
+	struct auxtrace_queue *queue;
+	int err;
+
+	if (idx >= queues->nr_queues) {
+		/*
+		 * idx + 1 would wrap to 0, which auxtrace_queues__grow()
+		 * accepts without making room, and the array would then be
+		 * indexed out of bounds.
+		 */
+		if (idx == UINT_MAX)
+			return -EINVAL;
+
+		err = auxtrace_queues__grow(queues, idx + 1);
+		if (err)
+			return err;
+	}
+
+	queue = &queues->queue_array[idx];
+
+	if (!queue->set) {
+		queue->set = true;
+		queue->tid = buffer->tid;
+		queue->cpu = buffer->cpu;
+	} else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) {
+		pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n",
+		       queue->cpu, queue->tid, buffer->cpu, buffer->tid);
+		return -EINVAL;
+	}
+
+	buffer->buffer_nr = queues->next_buffer_nr++;
+
+	list_add_tail(&buffer->list, &queue->head);
+
+	queues->new_data = true;
+	queues->populated = true;
+
+	return 0;
+}
+
+/* Limit buffers to 32MiB on 32-bit */
+#define BUFFER_LIMIT_FOR_32_BIT (32 * 1024 * 1024)
+
+/*
+ * Split @buffer into chunks of at most BUFFER_LIMIT_FOR_32_BIT bytes.
+ *
+ * Each full-size chunk is a copy of @buffer that is added to queue @idx right
+ * away.  @buffer itself is advanced past those chunks and shrunk to the
+ * remainder; it is NOT added to the queue here.  Every chunk after the first
+ * one is marked as consecutive.
+ *
+ * Returns 0 on success or a negative error code.  On error, chunks that were
+ * already added stay on the queue.
+ */
+static int auxtrace_queues__split_buffer(struct auxtrace_queues *queues,
+					 unsigned int idx,
+					 struct auxtrace_buffer *buffer)
+{
+	u64 sz = buffer->size;
+	bool consecutive = false;
+	struct auxtrace_buffer *b;
+	int err;
+
+	while (sz > BUFFER_LIMIT_FOR_32_BIT) {
+		/* The chunk inherits every field, then gets its own size */
+		b = memdup(buffer, sizeof(struct auxtrace_buffer));
+		if (!b)
+			return -ENOMEM;
+		b->size = BUFFER_LIMIT_FOR_32_BIT;
+		b->consecutive = consecutive;
+		err = auxtrace_queues__add_buffer(queues, idx, b);
+		if (err) {
+			auxtrace_buffer__free(b);
+			return err;
+		}
+		/* Move the original past the chunk that was just queued */
+		buffer->data_offset += BUFFER_LIMIT_FOR_32_BIT;
+		sz -= BUFFER_LIMIT_FOR_32_BIT;
+		consecutive = true;
+	}
+
+	/* What is left of the original is the final chunk */
+	buffer->size = sz;
+	buffer->consecutive = consecutive;
+
+	return 0;
+}
+
+/*
+ * Work out how the data of @buffer will be accessed, then add it to queue
+ * @idx:
+ *  - session uses a single mmap: point directly into that mapping
+ *  - data file is a pipe: read the data into an allocated copy now
+ *  - 32-bit build and a buffer above BUFFER_LIMIT_FOR_32_BIT: split it first
+ *  - otherwise buffer->data is left unset here
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int auxtrace_queues__add_event_buffer(struct auxtrace_queues *queues,
+					     struct perf_session *session,
+					     unsigned int idx,
+					     struct auxtrace_buffer *buffer)
+{
+	if (session->one_mmap) {
+		buffer->data = buffer->data_offset - session->one_mmap_offset +
+			       session->one_mmap_addr;
+	} else if (perf_data_file__is_pipe(session->file)) {
+		buffer->data = auxtrace_copy_data(buffer->size, session);
+		if (!buffer->data)
+			return -ENOMEM;
+		/* The copy belongs to the buffer and is freed with it */
+		buffer->data_needs_freeing = true;
+	} else if (BITS_PER_LONG == 32 &&
+		   buffer->size > BUFFER_LIMIT_FOR_32_BIT) {
+		int err;
+
+		err = auxtrace_queues__split_buffer(queues, idx, buffer);
+		if (err)
+			return err;
+	}
+
+	return auxtrace_queues__add_buffer(queues, idx, buffer);
+}
+
+/*
+ * Create a buffer describing the AUX data of the auxtrace @event, whose data
+ * starts at @data_offset, and add it to the queue selected by the event's
+ * idx.  If @buffer_ptr is not NULL the new buffer is returned through it.
+ *
+ * Returns 0 on success or a negative error code, in which case the buffer has
+ * been freed.
+ */
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+			       struct perf_session *session,
+			       union perf_event *event, off_t data_offset,
+			       struct auxtrace_buffer **buffer_ptr)
+{
+	struct auxtrace_buffer *buf;
+	unsigned int queue_idx;
+	int ret;
+
+	buf = zalloc(sizeof(struct auxtrace_buffer));
+	if (!buf)
+		return -ENOMEM;
+
+	buf->pid = -1;
+	buf->tid = event->auxtrace.tid;
+	buf->cpu = event->auxtrace.cpu;
+	buf->data_offset = data_offset;
+	buf->offset = event->auxtrace.offset;
+	buf->reference = event->auxtrace.reference;
+	buf->size = event->auxtrace.size;
+	queue_idx = event->auxtrace.idx;
+
+	ret = auxtrace_queues__add_event_buffer(queues, session, queue_idx,
+						buf);
+	if (ret) {
+		auxtrace_buffer__free(buf);
+		return ret;
+	}
+
+	if (buffer_ptr)
+		*buffer_ptr = buf;
+
+	return 0;
+}
+
+/*
+ * Peek at the event at @file_offset and, if it is a PERF_RECORD_AUXTRACE
+ * event, queue the AUX data that follows it.  Events of any other type are
+ * ignored.  The event size must match @sz and be at least the size of
+ * struct auxtrace_event, otherwise -EINVAL is returned.
+ */
+static int auxtrace_queues__add_indexed_event(struct auxtrace_queues *queues,
+					      struct perf_session *session,
+					      off_t file_offset, size_t sz)
+{
+	char buf[PERF_SAMPLE_MAX_SIZE];
+	union perf_event *event;
+	int err;
+
+	err = perf_session__peek_event(session, file_offset, buf,
+				       PERF_SAMPLE_MAX_SIZE, &event, NULL);
+	if (err)
+		return err;
+
+	if (event->header.type != PERF_RECORD_AUXTRACE)
+		return 0;
+
+	if (event->header.size < sizeof(struct auxtrace_event) ||
+	    event->header.size != sz)
+		return -EINVAL;
+
+	/* The AUX data starts right after the event itself */
+	file_offset += event->header.size;
+
+	return auxtrace_queues__add_event(queues, session, event, file_offset,
+					  NULL);
+}
+
+/*
+ * Free every buffer on every queue, then the queue array itself, leaving
+ * @queues with no queues.
+ */
+void auxtrace_queues__free(struct auxtrace_queues *queues)
+{
+	struct auxtrace_buffer *buffer, *next;
+	unsigned int i;
+
+	for (i = 0; i < queues->nr_queues; i++) {
+		struct list_head *head = &queues->queue_array[i].head;
+
+		list_for_each_entry_safe(buffer, next, head, list) {
+			list_del(&buffer->list);
+			auxtrace_buffer__free(buffer);
+		}
+	}
+
+	zfree(&queues->queue_array);
+	queues->nr_queues = 0;
+}
+
+/*
+ * Place (@queue_nr, @ordinal) into the min-heap @heap_array, starting from the
+ * free slot @pos and moving parents with a larger ordinal down until the
+ * right slot is found.
+ */
+static void auxtrace_heapify(struct auxtrace_heap_item *heap_array,
+			     unsigned int pos, unsigned int queue_nr,
+			     u64 ordinal)
+{
+	unsigned int up;
+
+	for (; pos; pos = up) {
+		up = (pos - 1) >> 1;
+		if (ordinal >= heap_array[up].ordinal)
+			break;
+		heap_array[pos] = heap_array[up];
+	}
+
+	heap_array[pos].ordinal = ordinal;
+	heap_array[pos].queue_nr = queue_nr;
+}
+
+/*
+ * Add queue @queue_nr with sort key @ordinal to @heap.
+ *
+ * Returns 0 on success or -ENOMEM if the heap array cannot be enlarged, in
+ * which case the heap is unchanged.
+ *
+ * NOTE(review): the array is sized from @queue_nr rather than from heap_cnt,
+ * which presumably relies on each queue being on the heap at most once at a
+ * time - confirm against the callers.
+ */
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+		       u64 ordinal)
+{
+	struct auxtrace_heap_item *heap_array;
+
+	if (queue_nr >= heap->heap_sz) {
+		unsigned int heap_sz = AUXTRACE_INIT_NR_QUEUES;
+
+		/* Smallest doubling of the initial size that is > queue_nr */
+		while (heap_sz <= queue_nr)
+			heap_sz <<= 1;
+		heap_array = realloc(heap->heap_array,
+				     heap_sz * sizeof(struct auxtrace_heap_item));
+		if (!heap_array)
+			return -ENOMEM;
+		heap->heap_array = heap_array;
+		heap->heap_sz = heap_sz;
+	}
+
+	/* The new item starts in the first free slot and is sifted up */
+	auxtrace_heapify(heap->heap_array, heap->heap_cnt++, queue_nr, ordinal);
+
+	return 0;
+}
+
+/* Release the storage of @heap and reset it to an empty heap. */
+void auxtrace_heap__free(struct auxtrace_heap *heap)
+{
+	heap->heap_sz = 0;
+	heap->heap_cnt = 0;
+	zfree(&heap->heap_array);
+}
+
+/*
+ * Remove the top item (index 0) from the min-heap @heap.  Does nothing if the
+ * heap is empty.
+ *
+ * The hole left at the top is walked down by repeatedly pulling up the child
+ * with the smaller ordinal.  Once the hole has no children, the last item of
+ * the array is placed into it and sifted up.
+ */
+void auxtrace_heap__pop(struct auxtrace_heap *heap)
+{
+	/* heap_cnt keeps the count from BEFORE the removal */
+	unsigned int pos, last, heap_cnt = heap->heap_cnt;
+	struct auxtrace_heap_item *heap_array;
+
+	if (!heap_cnt)
+		return;
+
+	heap->heap_cnt -= 1;
+
+	heap_array = heap->heap_array;
+
+	pos = 0;
+	while (1) {
+		unsigned int left, right;
+
+		left = (pos << 1) + 1;
+		if (left >= heap_cnt)
+			break;
+		right = left + 1;
+		if (right >= heap_cnt) {
+			/*
+			 * Only a left child, which is therefore the last item
+			 * of the array: moving it up completes the removal.
+			 */
+			heap_array[pos] = heap_array[left];
+			return;
+		}
+		if (heap_array[left].ordinal < heap_array[right].ordinal) {
+			heap_array[pos] = heap_array[left];
+			pos = left;
+		} else {
+			heap_array[pos] = heap_array[right];
+			pos = right;
+		}
+	}
+
+	/* Fill the hole with the last item, sifting it up as needed */
+	last = heap_cnt - 1;
+	auxtrace_heapify(heap_array, pos, heap_array[last].queue_nr,
+			 heap_array[last].ordinal);
+}
+
+/* Result of the info_priv_size() callback of @itr, or 0 if @itr is NULL. */
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+{
+	return itr ? itr->info_priv_size(itr) : 0;
+}
+
+/* Report that AUX area tracing is unavailable; always returns -EINVAL. */
+static int auxtrace_not_supported(void)
+{
+	pr_err("AUX area tracing is not supported on this architecture\n");
+	return -EINVAL;
+}
+
+/*
+ * Forward to the info_fill() callback of @itr.  Without an @itr an error is
+ * printed and -EINVAL is returned.
+ */
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+			       struct perf_session *session,
+			       struct auxtrace_info_event *auxtrace_info,
+			       size_t priv_size)
+{
+	if (!itr)
+		return auxtrace_not_supported();
+
+	return itr->info_fill(itr, session, auxtrace_info, priv_size);
+}
+
+/* Invoke the free() callback of @itr; a NULL @itr is ignored. */
+void auxtrace_record__free(struct auxtrace_record *itr)
+{
+	if (!itr)
+		return;
+
+	itr->free(itr);
+}
+
+/*
+ * Invoke the optional snapshot_start() callback of @itr.  Returns 0 if there
+ * is no @itr or no callback.
+ */
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr)
+{
+	if (!itr || !itr->snapshot_start)
+		return 0;
+
+	return itr->snapshot_start(itr);
+}
+
+/*
+ * Invoke the optional snapshot_finish() callback of @itr.  Returns 0 if there
+ * is no @itr or no callback.
+ */
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr)
+{
+	if (!itr || !itr->snapshot_finish)
+		return 0;
+
+	return itr->snapshot_finish(itr);
+}
+
+/*
+ * Invoke the optional find_snapshot() callback of @itr, which may adjust
+ * @head and @old.  Returns 0 if there is no @itr or no callback.
+ */
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+				   struct auxtrace_mmap *mm,
+				   unsigned char *data, u64 *head, u64 *old)
+{
+	if (!itr || !itr->find_snapshot)
+		return 0;
+
+	return itr->find_snapshot(itr, idx, mm, data, head, old);
+}
+
+/* Forward to the recording_options() callback of @itr; 0 if @itr is NULL. */
+int auxtrace_record__options(struct auxtrace_record *itr,
+			     struct perf_evlist *evlist,
+			     struct record_opts *opts)
+{
+	return itr ? itr->recording_options(itr, evlist, opts) : 0;
+}
+
+/* Result of the reference() callback of @itr, or 0 if @itr is NULL. */
+u64 auxtrace_record__reference(struct auxtrace_record *itr)
+{
+	if (!itr)
+		return 0;
+
+	return itr->reference(itr);
+}
+
+/*
+ * Hand the snapshot option string @str to the parse_snapshot_options()
+ * callback of @itr.  A NULL @str is accepted and does nothing; a non-NULL
+ * @str without an @itr is an error (-EINVAL).
+ */
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+				    struct record_opts *opts, const char *str)
+{
+	if (!str)
+		return 0;
+
+	if (!itr) {
+		pr_err("No AUX area tracing to snapshot\n");
+		return -EINVAL;
+	}
+
+	return itr->parse_snapshot_options(itr, opts, str);
+}
+
+/*
+ * Weak default: provides no auxtrace_record.  Sets *@err to 0 and returns
+ * NULL.  Being __weak, it is meant to be replaced by a non-weak definition
+ * where one is linked in.
+ */
+struct auxtrace_record *__weak
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused, int *err)
+{
+	*err = 0;
+	return NULL;
+}
+
+/*
+ * Allocate an empty auxtrace_index and append it to the list @head.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int auxtrace_index__alloc(struct list_head *head)
+{
+	struct auxtrace_index *idx = malloc(sizeof(struct auxtrace_index));
+
+	if (!idx)
+		return -ENOMEM;
+
+	idx->nr = 0;
+	INIT_LIST_HEAD(&idx->list);
+	list_add_tail(&idx->list, head);
+
+	return 0;
+}
+
+/* Unlink and free every auxtrace_index on the list @head. */
+void auxtrace_index__free(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct auxtrace_index *idx;
+
+		idx = list_entry(head->next, struct auxtrace_index, list);
+		list_del(&idx->list);
+		free(idx);
+	}
+}
+
+/*
+ * Return the last auxtrace_index on the list @head that still has room for
+ * another entry, allocating a new one if the list is empty or the last one is
+ * full.  Returns NULL if an allocation fails.
+ */
+static struct auxtrace_index *auxtrace_index__last(struct list_head *head)
+{
+	struct auxtrace_index *last;
+
+	if (list_empty(head) && auxtrace_index__alloc(head))
+		return NULL;
+
+	last = list_entry(head->prev, struct auxtrace_index, list);
+	if (last->nr < PERF_AUXTRACE_INDEX_ENTRY_COUNT)
+		return last;
+
+	/* The last index is full: chain a fresh one behind it */
+	if (auxtrace_index__alloc(head))
+		return NULL;
+
+	return list_entry(head->prev, struct auxtrace_index, list);
+}
+
+/*
+ * Record in the index list @head that @event is located at @file_offset.
+ * The file offset and the event's header size are stored.  Returns 0 on
+ * success or -ENOMEM.
+ */
+int auxtrace_index__auxtrace_event(struct list_head *head,
+				   union perf_event *event, off_t file_offset)
+{
+	struct auxtrace_index *index = auxtrace_index__last(head);
+	struct auxtrace_index_entry *ent;
+
+	if (!index)
+		return -ENOMEM;
+
+	ent = &index->entries[index->nr];
+	ent->file_offset = file_offset;
+	ent->sz = event->header.size;
+	index->nr += 1;
+
+	return 0;
+}
+
+/*
+ * Write the used entries of @auxtrace_index to @fd, one entry at a time.
+ * Returns 0 on success or -errno if a write comes up short.
+ */
+static int auxtrace_index__do_write(int fd,
+				    struct auxtrace_index *auxtrace_index)
+{
+	size_t n;
+
+	for (n = 0; n < auxtrace_index->nr; n++) {
+		struct auxtrace_index_entry ent;
+
+		ent.file_offset = auxtrace_index->entries[n].file_offset;
+		ent.sz = auxtrace_index->entries[n].sz;
+
+		if (writen(fd, &ent, sizeof(ent)) != sizeof(ent))
+			return -errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the whole index list @head to @fd: first the total number of entries
+ * as a u64, then the entries of every index in list order.  Returns 0 on
+ * success or a negative error code.
+ */
+int auxtrace_index__write(int fd, struct list_head *head)
+{
+	struct auxtrace_index *index;
+	u64 nr_entries = 0;
+
+	list_for_each_entry(index, head, list)
+		nr_entries += index->nr;
+
+	if (writen(fd, &nr_entries, sizeof(nr_entries)) != sizeof(nr_entries))
+		return -errno;
+
+	list_for_each_entry(index, head, list) {
+		int ret = auxtrace_index__do_write(fd, index);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Read one index entry from @fd and append it to the index list @head,
+ * byte-swapping both fields if @needs_swap.  Returns 0 on success or -1 if
+ * the read comes up short or no index can be allocated.
+ */
+static int auxtrace_index__process_entry(int fd, struct list_head *head,
+					 bool needs_swap)
+{
+	struct auxtrace_index_entry ent, *dst;
+	struct auxtrace_index *index;
+
+	if (readn(fd, &ent, sizeof(ent)) != sizeof(ent))
+		return -1;
+
+	index = auxtrace_index__last(head);
+	if (!index)
+		return -1;
+
+	if (needs_swap) {
+		ent.file_offset = bswap_64(ent.file_offset);
+		ent.sz = bswap_64(ent.sz);
+	}
+
+	dst = &index->entries[index->nr];
+	dst->file_offset = ent.file_offset;
+	dst->sz = ent.sz;
+	index->nr += 1;
+
+	return 0;
+}
+
+/*
+ * Read an auxtrace index of @size bytes from @fd into the index list of
+ * @session.  The data is a u64 entry count followed by that many entries;
+ * @needs_swap says whether the values must be byte-swapped.
+ *
+ * Returns 0 on success or -1 if the data is truncated, the entry count does
+ * not fit in @size, or an index cannot be allocated.
+ */
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+			    bool needs_swap)
+{
+	struct list_head *head = &session->auxtrace_index;
+	u64 nr;
+
+	if (readn(fd, &nr, sizeof(u64)) != sizeof(u64))
+		return -1;
+
+	if (needs_swap)
+		nr = bswap_64(nr);
+
+	/*
+	 * nr comes straight from the file.  Compare by division so that a huge
+	 * value cannot wrap the multiplication and slip past the size check.
+	 */
+	if (size < sizeof(u64) ||
+	    nr > (size - sizeof(u64)) / sizeof(struct auxtrace_index_entry))
+		return -1;
+
+	while (nr--) {
+		int err;
+
+		err = auxtrace_index__process_entry(fd, head, needs_swap);
+		if (err)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Queue the event described by index entry @ent; thin wrapper that unpacks
+ * the entry for auxtrace_queues__add_indexed_event().
+ */
+static int auxtrace_queues__process_index_entry(struct auxtrace_queues *queues,
+						struct perf_session *session,
+						struct auxtrace_index_entry *ent)
+{
+	return auxtrace_queues__add_indexed_event(queues, session,
+						  ent->file_offset, ent->sz);
+}
+
+/*
+ * Walk the auxtrace index of @session and queue the event of every entry.
+ * Stops at, and returns, the first error; returns 0 if all entries were
+ * processed.
+ */
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+				   struct perf_session *session)
+{
+	struct auxtrace_index *index;
+
+	list_for_each_entry(index, &session->auxtrace_index, list) {
+		struct auxtrace_index_entry *ent = index->entries;
+		struct auxtrace_index_entry *end = ent + index->nr;
+
+		for (; ent < end; ent++) {
+			int ret;
+
+			ret = auxtrace_queues__process_index_entry(queues,
+								   session,
+								   ent);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return the buffer that follows @buffer on @queue, or the first buffer of
+ * the queue if @buffer is NULL.  Returns NULL when there is no such buffer.
+ */
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+					      struct auxtrace_buffer *buffer)
+{
+	struct list_head *node;
+
+	if (buffer) {
+		if (list_is_last(&buffer->list, &queue->head))
+			return NULL;
+		node = buffer->list.next;
+	} else {
+		if (list_empty(&queue->head))
+			return NULL;
+		node = queue->head.next;
+	}
+
+	return list_entry(node, struct auxtrace_buffer, list);
+}
+
+/*
+ * Return a pointer to the data of @buffer.  If the buffer has no data pointer
+ * yet, the data is mapped read-only from @fd and the mapping is remembered in
+ * the buffer.  Returns NULL if the mapping fails.
+ */
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd)
+{
+	size_t adj, size;
+	off_t file_offset;
+	void *addr;
+
+	if (buffer->data)
+		return buffer->data;
+
+	/* Round the file offset down to a page and map the extra bytes too */
+	adj = buffer->data_offset & (page_size - 1);
+	size = buffer->size + adj;
+	file_offset = buffer->data_offset - adj;
+
+	addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset);
+	if (addr == MAP_FAILED)
+		return NULL;
+
+	buffer->mmap_addr = addr;
+	buffer->mmap_size = size;
+	buffer->data = addr + adj;
+
+	return buffer->data;
+}
+
+/*
+ * Undo auxtrace_buffer__get_data(): unmap the data of @buffer and clear its
+ * data pointers.  Does nothing unless the buffer has both data and a mapping.
+ */
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer)
+{
+	if (buffer->data && buffer->mmap_addr) {
+		munmap(buffer->mmap_addr, buffer->mmap_size);
+		buffer->mmap_addr = NULL;
+		buffer->mmap_size = 0;
+		buffer->data = NULL;
+		buffer->use_data = NULL;
+	}
+}
+
+/*
+ * Release the data of @buffer however it is held: a mapping is unmapped, and
+ * an allocated copy is freed, after which the buffer size is reset to 0.
+ */
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer)
+{
+	auxtrace_buffer__put_data(buffer);
+
+	if (!buffer->data_needs_freeing)
+		return;
+
+	buffer->data_needs_freeing = false;
+	zfree(&buffer->data);
+	buffer->use_data = NULL;
+	buffer->size = 0;
+}
+
+/*
+ * Release the data of @buffer, then the buffer itself.  The buffer is not
+ * unlinked from any list here.
+ */
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer)
+{
+	auxtrace_buffer__drop_data(buffer);
+	free(buffer);
+}
+
+/*
+ * Fill in @auxtrace_error as a PERF_RECORD_AUXTRACE_ERROR event carrying the
+ * given values.  @msg is copied with truncation to MAX_AUXTRACE_ERROR_MSG.
+ * The event size covers everything up to and including the terminating NUL
+ * of the message, rounded up to a multiple of sizeof(u64).
+ */
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+			  const char *msg)
+{
+	size_t size;
+
+	memset(auxtrace_error, 0, sizeof(struct auxtrace_error_event));
+
+	auxtrace_error->header.type = PERF_RECORD_AUXTRACE_ERROR;
+	auxtrace_error->type = type;
+	auxtrace_error->code = code;
+	auxtrace_error->cpu = cpu;
+	auxtrace_error->pid = pid;
+	auxtrace_error->tid = tid;
+	auxtrace_error->ip = ip;
+	strlcpy(auxtrace_error->msg, msg, MAX_AUXTRACE_ERROR_MSG);
+
+	/* Offset of msg within the event, plus the string and its NUL */
+	size = (void *)auxtrace_error->msg - (void *)auxtrace_error +
+	       strlen(auxtrace_error->msg) + 1;
+	auxtrace_error->header.size = PERF_ALIGN(size, sizeof(u64));
+}
+
+/*
+ * Build a PERF_RECORD_AUXTRACE_INFO event, let @itr fill it in, and pass it
+ * to @process.  The event is sized to hold the private data of @itr and is
+ * freed before returning.
+ *
+ * Returns the result of @process, or a negative error code if the event could
+ * not be allocated or filled in.
+ */
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+					 struct perf_tool *tool,
+					 struct perf_session *session,
+					 perf_event__handler_t process)
+{
+	union perf_event *ev;
+	size_t priv_size;
+	int ret;
+
+	pr_debug2("Synthesizing auxtrace information\n");
+
+	priv_size = auxtrace_record__info_priv_size(itr);
+
+	ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->auxtrace_info.header.type = PERF_RECORD_AUXTRACE_INFO;
+	ev->auxtrace_info.header.size = sizeof(struct auxtrace_info_event) +
+					priv_size;
+
+	ret = auxtrace_record__info_fill(itr, session, &ev->auxtrace_info,
+					 priv_size);
+	if (!ret)
+		ret = process(tool, ev, NULL, NULL);
+
+	free(ev);
+	return ret;
+}
+
+/*
+ * True if AUX data must not be decoded for @session: either no synthesis
+ * options are attached to the session or they say not to decode.
+ */
+static bool auxtrace__dont_decode(struct perf_session *session)
+{
+	if (!session->itrace_synth_opts)
+		return true;
+
+	return session->itrace_synth_opts->dont_decode;
+}
+
+/*
+ * Handle an auxtrace info event.  The type is printed when dump_trace is
+ * set.  As written, every type - PERF_AUXTRACE_UNKNOWN included - ends in
+ * -EINVAL, because the switch has no case that accepts a type.
+ */
+int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused,
+				      union perf_event *event,
+				      struct perf_session *session __maybe_unused)
+{
+	enum auxtrace_type type = event->auxtrace_info.type;
+
+	if (dump_trace)
+		fprintf(stdout, " type: %u\n", type);
+
+	switch (type) {
+	case PERF_AUXTRACE_UNKNOWN:
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Handle an auxtrace event.  The event fields are printed when dump_trace is
+ * set.  Unless decoding is disabled, the event is passed to the
+ * process_auxtrace_event() callback of the session's auxtrace.
+ *
+ * Returns the size of the AUX data (event->auxtrace.size) on success,
+ * including when decoding is disabled, or a negative error code.
+ */
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_session *session)
+{
+	s64 err;
+
+	if (dump_trace)
+		fprintf(stdout, " size: %#"PRIx64"  offset: %#"PRIx64"  ref: %#"PRIx64"  idx: %u  tid: %d  cpu: %d\n",
+			event->auxtrace.size, event->auxtrace.offset,
+			event->auxtrace.reference, event->auxtrace.idx,
+			event->auxtrace.tid, event->auxtrace.cpu);
+
+	/* Not decoding: nothing to do, but the data size is still reported */
+	if (auxtrace__dont_decode(session))
+		return event->auxtrace.size;
+
+	if (!session->auxtrace || event->header.type != PERF_RECORD_AUXTRACE)
+		return -EINVAL;
+
+	err = session->auxtrace->process_auxtrace_event(session, event, tool);
+	if (err < 0)
+		return err;
+
+	return event->auxtrace.size;
+}
+
+#define PERF_ITRACE_DEFAULT_PERIOD_TYPE		PERF_ITRACE_PERIOD_NANOSECS
+#define PERF_ITRACE_DEFAULT_PERIOD		100000
+#define PERF_ITRACE_DEFAULT_CALLCHAIN_SZ	16
+#define PERF_ITRACE_MAX_CALLCHAIN_SZ		1024
+
+/*
+ * Apply the default synthesis options to @synth_opts: instructions, branches,
+ * transactions and errors are all enabled, with the default period, period
+ * type and callchain size.  Other fields are left as they are.
+ */
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
+{
+	/* What to synthesize */
+	synth_opts->errors = true;
+	synth_opts->transactions = true;
+	synth_opts->branches = true;
+	synth_opts->instructions = true;
+
+	/* How to synthesize it */
+	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+	synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+}
+
+/*
+ * The options parsed here are documented in
+ * tools/perf/Documentation/perf-script.txt.  That documentation is added by a
+ * later changeset, together with 'perf script' support for these options.
+ */
+/*
+ * Option callback: parse the instruction tracing synthesis options in @str
+ * into the struct itrace_synth_opts that opt->value points to.
+ *
+ * @unset requests no decoding at all.  A NULL @str selects the defaults.
+ * Otherwise @str is a sequence of option letters, which spaces and commas may
+ * separate:
+ *   i[period[unit]]  instructions; unit is i (instructions), t (ticks),
+ *                    ms, us or ns
+ *   b  branches          x  transactions     e  errors
+ *   d  decoding log      c  calls            r  returns
+ *   g[size]  callchains, size from 1 to PERF_ITRACE_MAX_CALLCHAIN_SZ
+ *
+ * Returns 0 on success or -EINVAL if @str cannot be parsed.
+ */
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+			    int unset)
+{
+	struct itrace_synth_opts *synth_opts = opt->value;
+	const char *p;
+	char *endptr;
+	/* Set once a period unit has been given explicitly */
+	bool period_type_set = false;
+
+	synth_opts->set = true;
+
+	if (unset) {
+		synth_opts->dont_decode = true;
+		return 0;
+	}
+
+	if (!str) {
+		itrace_synth_opts__set_default(synth_opts);
+		return 0;
+	}
+
+	for (p = str; *p;) {
+		switch (*p++) {
+		case 'i':
+			synth_opts->instructions = true;
+			while (*p == ' ' || *p == ',')
+				p += 1;
+			if (isdigit(*p)) {
+				synth_opts->period = strtoull(p, &endptr, 10);
+				p = endptr;
+				while (*p == ' ' || *p == ',')
+					p += 1;
+				switch (*p++) {
+				case 'i':
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_INSTRUCTIONS;
+					period_type_set = true;
+					break;
+				case 't':
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_TICKS;
+					period_type_set = true;
+					break;
+				case 'm':
+					synth_opts->period *= 1000;
+					/* Fall through */
+				case 'u':
+					synth_opts->period *= 1000;
+					/* Fall through */
+				case 'n':
+					if (*p++ != 's')
+						goto out_err;
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_NANOSECS;
+					period_type_set = true;
+					break;
+				case '\0':
+					/* Period without a unit at end of string */
+					goto out;
+				default:
+					goto out_err;
+				}
+			}
+			break;
+		case 'b':
+			synth_opts->branches = true;
+			break;
+		case 'x':
+			synth_opts->transactions = true;
+			break;
+		case 'e':
+			synth_opts->errors = true;
+			break;
+		case 'd':
+			synth_opts->log = true;
+			break;
+		case 'c':
+			synth_opts->branches = true;
+			synth_opts->calls = true;
+			break;
+		case 'r':
+			synth_opts->branches = true;
+			synth_opts->returns = true;
+			break;
+		case 'g':
+			synth_opts->callchain = true;
+			synth_opts->callchain_sz =
+					PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+			while (*p == ' ' || *p == ',')
+				p += 1;
+			if (isdigit(*p)) {
+				/*
+				 * Keep the full strtoul() result so that an
+				 * oversized number is rejected by the range
+				 * check instead of being truncated into range.
+				 */
+				unsigned long val;
+
+				val = strtoul(p, &endptr, 10);
+				p = endptr;
+				if (!val || val > PERF_ITRACE_MAX_CALLCHAIN_SZ)
+					goto out_err;
+				synth_opts->callchain_sz = val;
+			}
+			break;
+		case ' ':
+		case ',':
+			break;
+		default:
+			goto out_err;
+		}
+	}
+out:
+	if (synth_opts->instructions) {
+		/*
+		 * PERF_ITRACE_PERIOD_INSTRUCTIONS is 0, so a zero period_type
+		 * alone cannot tell "no unit given" from an explicit 'i'
+		 * unit.  Only fall back to the default in the former case.
+		 */
+		if (!period_type_set && !synth_opts->period_type)
+			synth_opts->period_type =
+					PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+		if (!synth_opts->period)
+			synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+	}
+
+	return 0;
+
+out_err:
+	pr_err("Bad Instruction Tracing options '%s'\n", str);
+	return -EINVAL;
+}
+
+/*
+ * Names of the auxtrace error types, indexed by error type.  Types without
+ * an initializer here are NULL.
+ */
+static const char * const auxtrace_error_type_name[] = {
+	[PERF_AUXTRACE_ERROR_ITRACE] = "instruction trace",
+};
+
+/*
+ * Return the name of auxtrace error type @type, or "unknown AUX" if the type
+ * is out of range or has no name.
+ */
+static const char *auxtrace_error_name(int type)
+{
+	const char *error_type_name = NULL;
+
+	/*
+	 * type is signed and callers pass values taken from event data, so
+	 * guard the lower bound too: a negative value must not index the table.
+	 */
+	if (type >= 0 && type < PERF_AUXTRACE_ERROR_MAX)
+		error_type_name = auxtrace_error_type_name[type];
+	if (!error_type_name)
+		error_type_name = "unknown AUX";
+	return error_type_name;
+}
+
+/*
+ * Print the auxtrace error @event to @fp on one line: error type name and
+ * number, then cpu, pid, tid, ip, code and message.  Returns the sum of the
+ * two fprintf() results.
+ *
+ * NOTE(review): the sum is accumulated in an int and returned as size_t, so a
+ * negative fprintf() result would come back as a very large value.
+ */
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp)
+{
+	struct auxtrace_error_event *e = &event->auxtrace_error;
+	int ret;
+
+	ret = fprintf(fp, " %s error type %u",
+		      auxtrace_error_name(e->type), e->type);
+	ret += fprintf(fp, " cpu %d pid %d tid %d ip %#"PRIx64" code %u: %s\n",
+		       e->cpu, e->pid, e->tid, e->ip, e->code, e->msg);
+	return ret;
+}
+
+/*
+ * Count the auxtrace error @event in the per-type error statistics of the
+ * session's evlist.  Events with an out-of-range type are not counted.
+ */
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+				      union perf_event *event)
+{
+	struct auxtrace_error_event *e = &event->auxtrace_error;
+
+	if (e->type >= PERF_AUXTRACE_ERROR_MAX)
+		return;
+
+	session->evlist->stats.nr_auxtrace_errors[e->type] += 1;
+}
+
+/* Emit one warning per auxtrace error type that has a non-zero count. */
+void events_stats__auxtrace_error_warn(const struct events_stats *stats)
+{
+	int type;
+
+	for (type = 0; type < PERF_AUXTRACE_ERROR_MAX; type++) {
+		if (stats->nr_auxtrace_errors[type])
+			ui__warning("%u %s errors\n",
+				    stats->nr_auxtrace_errors[type],
+				    auxtrace_error_name(type));
+	}
+}
+
+/*
+ * Handle an auxtrace error event by printing it to stdout, unless decoding
+ * is disabled for @session.  Always returns 0.
+ */
+int perf_event__process_auxtrace_error(struct perf_tool *tool __maybe_unused,
+				       union perf_event *event,
+				       struct perf_session *session)
+{
+	if (!auxtrace__dont_decode(session))
+		perf_event__fprintf_auxtrace_error(event, stdout);
+
+	return 0;
+}
+
+/*
+ * Hand the AUX data that was written to @mm since the last read to @fn.
+ *
+ * The data lies between mm->prev ('old') and the current head.  Because the
+ * buffer is circular it is passed to @fn as up to two pieces (data1/len1
+ * followed by data2/len2), together with a PERF_RECORD_AUXTRACE event that
+ * describes it.  In @snapshot mode the head is read differently, @itr may
+ * adjust head and old, and at most @snapshot_size bytes are taken.
+ *
+ * Returns 1 if data was passed to @fn, 0 if there was no new data, or a
+ * negative value on error.
+ */
+static int __auxtrace_mmap__read(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 bool snapshot, size_t snapshot_size)
+{
+	u64 head, old = mm->prev, offset, ref;
+	unsigned char *data = mm->base;
+	size_t size, head_off, old_off, len1, len2, padding;
+	union perf_event ev;
+	void *data1, *data2;
+
+	if (snapshot) {
+		head = auxtrace_mmap__read_snapshot_head(mm);
+		if (auxtrace_record__find_snapshot(itr, mm->idx, mm, data,
+						   &head, &old))
+			return -1;
+	} else {
+		head = auxtrace_mmap__read_head(mm);
+	}
+
+	if (old == head)
+		return 0;
+
+	pr_debug3("auxtrace idx %d old %#"PRIx64" head %#"PRIx64" diff %#"PRIx64"\n",
+		  mm->idx, old, head, head - old);
+
+	/* Positions within the buffer; the mask is 0 unless len is a power of 2 */
+	if (mm->mask) {
+		head_off = head & mm->mask;
+		old_off = old & mm->mask;
+	} else {
+		head_off = head % mm->len;
+		old_off = old % mm->len;
+	}
+
+	/* Equal offsets with head != old are taken as a full buffer */
+	if (head_off > old_off)
+		size = head_off - old_off;
+	else
+		size = mm->len - (old_off - head_off);
+
+	if (snapshot && size > snapshot_size)
+		size = snapshot_size;
+
+	ref = auxtrace_record__reference(itr);
+
+	if (head > old || size <= head || mm->mask) {
+		offset = head - size;
+	} else {
+		/*
+		 * When the buffer size is not a power of 2, 'head' wraps at the
+		 * highest multiple of the buffer size, so we have to subtract
+		 * the remainder here.
+		 */
+		u64 rem = (0ULL - mm->len) % mm->len;
+
+		offset = head - size - rem;
+	}
+
+	if (size > head_off) {
+		/* The data wraps: tail end of the buffer, then its start */
+		len1 = size - head_off;
+		data1 = &data[mm->len - len1];
+		len2 = head_off;
+		data2 = &data[0];
+	} else {
+		/* The data is one contiguous piece ending at head_off */
+		len1 = size;
+		data1 = &data[head_off - len1];
+		len2 = 0;
+		data2 = NULL;
+	}
+
+	/* padding must be written by fn() e.g. record__process_auxtrace() */
+	padding = size & 7;
+	if (padding)
+		padding = 8 - padding;
+
+	memset(&ev, 0, sizeof(ev));
+	ev.auxtrace.header.type = PERF_RECORD_AUXTRACE;
+	ev.auxtrace.header.size = sizeof(ev.auxtrace);
+	ev.auxtrace.size = size + padding;
+	ev.auxtrace.offset = offset;
+	ev.auxtrace.reference = ref;
+	ev.auxtrace.idx = mm->idx;
+	ev.auxtrace.tid = mm->tid;
+	ev.auxtrace.cpu = mm->cpu;
+
+	if (fn(tool, &ev, data1, len1, data2, len2))
+		return -1;
+
+	/* Only remember the new position once fn() has accepted the data */
+	mm->prev = head;
+
+	if (!snapshot) {
+		auxtrace_mmap__write_tail(mm, head);
+		/*
+		 * NOTE(review): itr is dereferenced without a NULL check here,
+		 * unlike in the auxtrace_record__*() wrappers - confirm that
+		 * callers never pass a NULL itr on this path.
+		 */
+		if (itr->read_finish) {
+			int err;
+
+			err = itr->read_finish(itr, mm->idx);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 1;
+}
+
+/*
+ * Read new AUX data from @mm in normal (non-snapshot) mode and pass it to
+ * @fn.  Return values are those of __auxtrace_mmap__read().
+ */
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+			struct perf_tool *tool, process_auxtrace_t fn)
+{
+	return __auxtrace_mmap__read(mm, itr, tool, fn, false, 0);
+}
+
+/*
+ * Read AUX data from @mm in snapshot mode, taking at most @snapshot_size
+ * bytes, and pass it to @fn.  Return values are those of
+ * __auxtrace_mmap__read().
+ */
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 size_t snapshot_size)
+{
+	return __auxtrace_mmap__read(mm, itr, tool, fn, true, snapshot_size);
+}
+
+/**
+ * struct auxtrace_cache - hash table to implement a cache
+ * @hashtable: the hashtable, an array of @sz hlist heads
+ * @sz: hashtable size (number of hlists)
+ * @entry_size: size in bytes of an entry, as allocated by
+ *              auxtrace_cache__alloc_entry()
+ * @limit: limit the number of entries to this maximum, when reached the cache
+ *         is dropped and caching begins again with an empty cache; a limit of
+ *         zero disables the check
+ * @cnt: current number of entries
+ * @bits: hashtable size as a power of 2 (@sz = 2^@bits), also the number of
+ *        bits used when hashing a key
+ */
+struct auxtrace_cache {
+	struct hlist_head *hashtable;
+	size_t sz;
+	size_t entry_size;
+	size_t limit;
+	size_t cnt;
+	unsigned int bits;
+};
+
+/*
+ * Create a cache with 2^@bits hash lists for entries of @entry_size bytes.
+ * The entry limit is @limit_percent percent of the number of hash lists.
+ * Returns the new cache or NULL if memory cannot be allocated.
+ */
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+					   unsigned int limit_percent)
+{
+	struct auxtrace_cache *cache;
+	struct hlist_head *table;
+	size_t nr_lists, n;
+
+	cache = zalloc(sizeof(struct auxtrace_cache));
+	if (!cache)
+		return NULL;
+
+	nr_lists = 1UL << bits;
+
+	table = calloc(nr_lists, sizeof(struct hlist_head));
+	if (!table) {
+		free(cache);
+		return NULL;
+	}
+
+	for (n = 0; n < nr_lists; n++)
+		INIT_HLIST_HEAD(&table[n]);
+
+	cache->hashtable = table;
+	cache->sz = nr_lists;
+	cache->entry_size = entry_size;
+	cache->limit = (cache->sz * limit_percent) / 100;
+	cache->bits = bits;
+
+	return cache;
+}
+
+/*
+ * Remove and free every entry of cache @c, leaving an empty cache.  A NULL
+ * @c is ignored.
+ */
+static void auxtrace_cache__drop(struct auxtrace_cache *c)
+{
+	struct auxtrace_cache_entry *entry;
+	struct hlist_node *next;
+	struct hlist_head *bucket;
+	size_t n;
+
+	if (!c)
+		return;
+
+	for (n = 0; n < c->sz; n++) {
+		bucket = &c->hashtable[n];
+		hlist_for_each_entry_safe(entry, next, bucket, hash) {
+			hlist_del(&entry->hash);
+			auxtrace_cache__free_entry(c, entry);
+		}
+	}
+
+	c->cnt = 0;
+}
+
+/* Free cache @c together with all of its entries; a NULL @c is ignored. */
+void auxtrace_cache__free(struct auxtrace_cache *c)
+{
+	if (c) {
+		auxtrace_cache__drop(c);
+		free(c->hashtable);
+		free(c);
+	}
+}
+
+/*
+ * Allocate uninitialised memory for one entry of cache @c (c->entry_size
+ * bytes).  Returns NULL if the allocation fails.
+ */
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c)
+{
+	return malloc(c->entry_size);
+}
+
+/*
+ * Free the memory of @entry.  The entry is not unlinked from the cache here;
+ * @c is unused.
+ */
+void auxtrace_cache__free_entry(struct auxtrace_cache *c __maybe_unused,
+				void *entry)
+{
+	free(entry);
+}
+
+/*
+ * Add @entry to cache @c under @key.  If the cache has a limit and this
+ * addition exceeds it, all existing entries are dropped first.  Existing
+ * entries with the same key are not looked for.  Always returns 0.
+ *
+ * NOTE(review): after a drop, cnt is 0 although the cache then holds this one
+ * entry, so the count lags by one until the next drop.  When the limit is 0,
+ * cnt is not maintained at all.
+ */
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+			struct auxtrace_cache_entry *entry)
+{
+	if (c->limit && ++c->cnt > c->limit)
+		auxtrace_cache__drop(c);
+
+	entry->key = key;
+	hlist_add_head(&entry->hash, &c->hashtable[hash_32(key, c->bits)]);
+
+	return 0;
+}
+
+/*
+ * Find the entry stored under @key in cache @c.  Returns the first matching
+ * entry, or NULL if there is none or @c is NULL.
+ */
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key)
+{
+	struct auxtrace_cache_entry *candidate;
+	struct hlist_head *bucket;
+
+	if (!c)
+		return NULL;
+
+	bucket = &c->hashtable[hash_32(key, c->bits)];
+
+	hlist_for_each_entry(candidate, bucket, hash) {
+		if (candidate->key == key)
+			return candidate;
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
new file mode 100644
index 000000000000..a171abbe7301
--- /dev/null
+++ b/tools/perf/util/auxtrace.h
@@ -0,0 +1,643 @@
+/*
+ * auxtrace.h: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __PERF_AUXTRACE_H
+#define __PERF_AUXTRACE_H
+
+#include <sys/types.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf.h"
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+
+union perf_event;
+struct perf_session;
+struct perf_evlist;
+struct perf_tool;
+struct option;
+struct record_opts;
+struct auxtrace_info_event;
+struct events_stats;
+
+enum auxtrace_type {
+	PERF_AUXTRACE_UNKNOWN,
+};
+
+enum itrace_period_type {
+	PERF_ITRACE_PERIOD_INSTRUCTIONS,
+	PERF_ITRACE_PERIOD_TICKS,
+	PERF_ITRACE_PERIOD_NANOSECS,
+};
+
+/**
+ * struct itrace_synth_opts - AUX area tracing synthesis options.
+ * @set: indicates whether or not options have been set
+ * @inject: indicates the event (not just the sample) must be fully synthesized
+ *          because 'perf inject' will write it out
+ * @instructions: whether to synthesize 'instructions' events
+ * @branches: whether to synthesize 'branches' events
+ * @transactions: whether to synthesize events for transactions
+ * @errors: whether to synthesize decoder error events
+ * @dont_decode: whether to skip decoding entirely
+ * @log: write a decoding log
+ * @calls: limit branch samples to calls (can be combined with @returns)
+ * @returns: limit branch samples to returns (can be combined with @calls)
+ * @callchain: add callchain to 'instructions' events
+ * @callchain_sz: maximum callchain size
+ * @period: 'instructions' events period
+ * @period_type: 'instructions' events period type
+ */
+struct itrace_synth_opts {
+	bool			set;
+	bool			inject;
+	bool			instructions;
+	bool			branches;
+	bool			transactions;
+	bool			errors;
+	bool			dont_decode;
+	bool			log;
+	bool			calls;
+	bool			returns;
+	bool			callchain;
+	unsigned int		callchain_sz;
+	unsigned long long	period;
+	enum itrace_period_type	period_type;
+};
+
+/**
+ * struct auxtrace_index_entry - indexes an AUX area tracing event within a
+ *                               perf.data file.
+ * @file_offset: offset within the perf.data file
+ * @sz: size of the event
+ */
+struct auxtrace_index_entry {
+	u64			file_offset;
+	u64			sz;
+};
+
+#define PERF_AUXTRACE_INDEX_ENTRY_COUNT 256
+
+/**
+ * struct auxtrace_index - index of AUX area tracing events within a perf.data
+ *                         file.
+ * @list: linking a number of arrays of entries
+ * @nr: number of entries
+ * @entries: array of entries
+ */
+struct auxtrace_index {
+	struct list_head	list;
+	size_t			nr;
+	struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
+};
+
+/**
+ * struct auxtrace - session callbacks to allow AUX area data decoding.
+ * @process_event: lets the decoder see all session events
+ * @flush_events: process any remaining data
+ * @free_events: free resources associated with event processing
+ * @free: free resources associated with the session
+ */
+struct auxtrace {
+	int (*process_event)(struct perf_session *session,
+			     union perf_event *event,
+			     struct perf_sample *sample,
+			     struct perf_tool *tool);
+	int (*process_auxtrace_event)(struct perf_session *session,
+				      union perf_event *event,
+				      struct perf_tool *tool);
+	int (*flush_events)(struct perf_session *session,
+			    struct perf_tool *tool);
+	void (*free_events)(struct perf_session *session);
+	void (*free)(struct perf_session *session);
+};
+
+/**
+ * struct auxtrace_buffer - a buffer containing AUX area tracing data.
+ * @list: buffers are queued in a list held by struct auxtrace_queue
+ * @size: size of the buffer in bytes
+ * @pid: in per-thread mode, the pid this buffer is associated with
+ * @tid: in per-thread mode, the tid this buffer is associated with
+ * @cpu: in per-cpu mode, the cpu this buffer is associated with
+ * @data: actual buffer data (can be null if the data has not been loaded)
+ * @data_offset: file offset at which the buffer can be read
+ * @mmap_addr: mmap address at which the buffer can be read
+ * @mmap_size: size of the mmap at @mmap_addr
+ * @data_needs_freeing: @data was malloc'd so free it when it is no longer
+ *                      needed
+ * @consecutive: the original data was split up and this buffer is consecutive
+ *               to the previous buffer
+ * @offset: offset as determined by aux_head / aux_tail members of struct
+ *          perf_event_mmap_page
+ * @reference: an implementation-specific reference determined when the data is
+ *             recorded
+ * @buffer_nr: used to number each buffer
+ * @use_size: implementation actually only uses this number of bytes
+ * @use_data: implementation actually only uses data starting at this address
+ */
+struct auxtrace_buffer {
+	struct list_head	list;
+	size_t			size;
+	pid_t			pid;
+	pid_t			tid;
+	int			cpu;
+	void			*data;
+	off_t			data_offset;
+	void			*mmap_addr;
+	size_t			mmap_size;
+	bool			data_needs_freeing;
+	bool			consecutive;
+	u64			offset;
+	u64			reference;
+	u64			buffer_nr;
+	size_t			use_size;
+	void			*use_data;
+};
+
+/**
+ * struct auxtrace_queue - a queue of AUX area tracing data buffers.
+ * @head: head of buffer list
+ * @tid: in per-thread mode, the tid this queue is associated with
+ * @cpu: in per-cpu mode, the cpu this queue is associated with
+ * @set: %true once this queue has been dedicated to a specific thread or cpu
+ * @priv: implementation-specific data
+ */
+struct auxtrace_queue {
+	struct list_head	head;
+	pid_t			tid;
+	int			cpu;
+	bool			set;
+	void			*priv;
+};
+
+/**
+ * struct auxtrace_queues - an array of AUX area tracing queues.
+ * @queue_array: array of queues
+ * @nr_queues: number of queues
+ * @new_data: set whenever new data is queued
+ * @populated: queues have been fully populated using the auxtrace_index
+ * @next_buffer_nr: used to number each buffer
+ */
+struct auxtrace_queues {
+	struct auxtrace_queue	*queue_array;
+	unsigned int		nr_queues;
+	bool			new_data;
+	bool			populated;
+	u64			next_buffer_nr;
+};
+
+/**
+ * struct auxtrace_heap_item - element of struct auxtrace_heap.
+ * @queue_nr: queue number
+ * @ordinal: value used for sorting (lowest ordinal is top of the heap) expected
+ *           to be a timestamp
+ */
+struct auxtrace_heap_item {
+	unsigned int		queue_nr;
+	u64			ordinal;
+};
+
+/**
+ * struct auxtrace_heap - a heap suitable for sorting AUX area tracing queues.
+ * @heap_array: the heap
+ * @heap_cnt: the number of elements in the heap
+ * @heap_sz: maximum number of elements (grows as needed)
+ */
+struct auxtrace_heap {
+	struct auxtrace_heap_item	*heap_array;
+	unsigned int		heap_cnt;
+	unsigned int		heap_sz;
+};
+
+/**
+ * struct auxtrace_mmap - records an mmap of the auxtrace buffer.
+ * @base: address of mapped area
+ * @userpg: pointer to buffer's perf_event_mmap_page
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @len: size of mapped area
+ * @prev: previous aux_head
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap {
+	void		*base;
+	void		*userpg;
+	size_t		mask;
+	size_t		len;
+	u64		prev;
+	int		idx;
+	pid_t		tid;
+	int		cpu;
+};
+
+/**
+ * struct auxtrace_mmap_params - parameters to set up struct auxtrace_mmap.
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @offset: file offset of mapped area
+ * @len: size of mapped area
+ * @prot: mmap memory protection
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap_params {
+	size_t		mask;
+	off_t		offset;
+	size_t		len;
+	int		prot;
+	int		idx;
+	pid_t		tid;
+	int		cpu;
+};
+
+/**
+ * struct auxtrace_record - callbacks for recording AUX area data.
+ * @recording_options: validate and process recording options
+ * @info_priv_size: return the size of the private data in auxtrace_info_event
+ * @info_fill: fill-in the private data in auxtrace_info_event
+ * @free: free this auxtrace record structure
+ * @snapshot_start: starting a snapshot
+ * @snapshot_finish: finishing a snapshot
+ * @find_snapshot: find data to snapshot within auxtrace mmap
+ * @parse_snapshot_options: parse snapshot options
+ * @reference: provide a 64-bit reference number for auxtrace_event
+ * @read_finish: called after reading from an auxtrace mmap
+ */
+struct auxtrace_record {
+	int (*recording_options)(struct auxtrace_record *itr,
+				 struct perf_evlist *evlist,
+				 struct record_opts *opts);
+	size_t (*info_priv_size)(struct auxtrace_record *itr);
+	int (*info_fill)(struct auxtrace_record *itr,
+			 struct perf_session *session,
+			 struct auxtrace_info_event *auxtrace_info,
+			 size_t priv_size);
+	void (*free)(struct auxtrace_record *itr);
+	int (*snapshot_start)(struct auxtrace_record *itr);
+	int (*snapshot_finish)(struct auxtrace_record *itr);
+	int (*find_snapshot)(struct auxtrace_record *itr, int idx,
+			     struct auxtrace_mmap *mm, unsigned char *data,
+			     u64 *head, u64 *old);
+	int (*parse_snapshot_options)(struct auxtrace_record *itr,
+				      struct record_opts *opts,
+				      const char *str);
+	u64 (*reference)(struct auxtrace_record *itr);
+	int (*read_finish)(struct auxtrace_record *itr, int idx);
+};
+
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+/*
+ * In snapshot mode the mmapped page is read-only which makes using
+ * __sync_val_compare_and_swap() problematic.  However, snapshot mode expects
+ * the buffer is not updated while the snapshot is made (e.g. Intel PT disables
+ * the event) so there is not a race anyway.
+ */
+static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+	u64 head = ACCESS_ONCE(pc->aux_head);
+
+	/* Ensure all reads are done after we read the head */
+	rmb();
+	return head;
+}
+
+static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	u64 head = ACCESS_ONCE(pc->aux_head);
+#else
+	u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0);
+#endif
+
+	/* Ensure all reads are done after we read the head */
+	rmb();
+	return head;
+}
+
+static inline void auxtrace_mmap__write_tail(struct auxtrace_mmap *mm, u64 tail)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG != 64 && defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	u64 old_tail;
+#endif
+
+	/* Ensure all reads are done before we write the tail out */
+	mb();
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	pc->aux_tail = tail;
+#else
+	do {
+		old_tail = __sync_val_compare_and_swap(&pc->aux_tail, 0, 0);
+	} while (!__sync_bool_compare_and_swap(&pc->aux_tail, old_tail, tail));
+#endif
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu);
+
+typedef int (*process_auxtrace_t)(struct perf_tool *tool,
+				  union perf_event *event, void *data1,
+				  size_t len1, void *data2, size_t len2);
+
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+			struct perf_tool *tool, process_auxtrace_t fn);
+
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 size_t snapshot_size);
+
+int auxtrace_queues__init(struct auxtrace_queues *queues);
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+			       struct perf_session *session,
+			       union perf_event *event, off_t data_offset,
+			       struct auxtrace_buffer **buffer_ptr);
+void auxtrace_queues__free(struct auxtrace_queues *queues);
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+				   struct perf_session *session);
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+					      struct auxtrace_buffer *buffer);
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd);
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer);
+
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+		       u64 ordinal);
+void auxtrace_heap__pop(struct auxtrace_heap *heap);
+void auxtrace_heap__free(struct auxtrace_heap *heap);
+
+struct auxtrace_cache_entry {
+	struct hlist_node hash;
+	u32 key;
+};
+
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+					   unsigned int limit_percent);
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache);
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c);
+void auxtrace_cache__free_entry(struct auxtrace_cache *c, void *entry);
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+			struct auxtrace_cache_entry *entry);
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key);
+
+struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist,
+					      int *err);
+
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+				    struct record_opts *opts,
+				    const char *str);
+int auxtrace_record__options(struct auxtrace_record *itr,
+			     struct perf_evlist *evlist,
+			     struct record_opts *opts);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+			       struct perf_session *session,
+			       struct auxtrace_info_event *auxtrace_info,
+			       size_t priv_size);
+void auxtrace_record__free(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr);
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+				   struct auxtrace_mmap *mm,
+				   unsigned char *data, u64 *head, u64 *old);
+u64 auxtrace_record__reference(struct auxtrace_record *itr);
+
+int auxtrace_index__auxtrace_event(struct list_head *head, union perf_event *event,
+				   off_t file_offset);
+int auxtrace_index__write(int fd, struct list_head *head);
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+			    bool needs_swap);
+void auxtrace_index__free(struct list_head *head);
+
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+			  const char *msg);
+
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+					 struct perf_tool *tool,
+					 struct perf_session *session,
+					 perf_event__handler_t process);
+int perf_event__process_auxtrace_info(struct perf_tool *tool,
+				      union perf_event *event,
+				      struct perf_session *session);
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_session *session);
+int perf_event__process_auxtrace_error(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session);
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+			    int unset);
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts);
+
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp);
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+				      union perf_event *event);
+void events_stats__auxtrace_error_warn(const struct events_stats *stats);
+
+static inline int auxtrace__process_event(struct perf_session *session,
+					  union perf_event *event,
+					  struct perf_sample *sample,
+					  struct perf_tool *tool)
+{
+	if (!session->auxtrace)
+		return 0;
+
+	return session->auxtrace->process_event(session, event, sample, tool);
+}
+
+static inline int auxtrace__flush_events(struct perf_session *session,
+					 struct perf_tool *tool)
+{
+	if (!session->auxtrace)
+		return 0;
+
+	return session->auxtrace->flush_events(session, tool);
+}
+
+static inline void auxtrace__free_events(struct perf_session *session)
+{
+	/* No auxtrace callbacks attached to this session: nothing to do. */
+	if (!session->auxtrace)
+		return;
+	session->auxtrace->free_events(session);
+}
+
+static inline void auxtrace__free(struct perf_session *session)
+{
+	/* No auxtrace callbacks attached to this session: nothing to do. */
+	if (!session->auxtrace)
+		return;
+	session->auxtrace->free(session);
+}
+
+#else
+
+static inline struct auxtrace_record *
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused,
+		      int *err __maybe_unused)
+{
+	*err = 0;
+	return NULL;
+}
+
+static inline
+void auxtrace_record__free(struct auxtrace_record *itr __maybe_unused)
+{
+}
+
+static inline int
+perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused,
+				     struct perf_tool *tool __maybe_unused,
+				     struct perf_session *session __maybe_unused,
+				     perf_event__handler_t process __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_record__options(struct auxtrace_record *itr __maybe_unused,
+			     struct perf_evlist *evlist __maybe_unused,
+			     struct record_opts *opts __maybe_unused)
+{
+	return 0;
+}
+
+#define perf_event__process_auxtrace_info		0
+#define perf_event__process_auxtrace			0
+#define perf_event__process_auxtrace_error		0
+
+static inline
+void perf_session__auxtrace_error_inc(struct perf_session *session
+				      __maybe_unused,
+				      union perf_event *event
+				      __maybe_unused)
+{
+}
+
+static inline
+void events_stats__auxtrace_error_warn(const struct events_stats *stats
+				       __maybe_unused)
+{
+}
+
+static inline
+int itrace_parse_synth_opts(const struct option *opt __maybe_unused,
+			    const char *str __maybe_unused,
+			    int unset __maybe_unused)
+{
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr __maybe_unused,
+				    struct record_opts *opts __maybe_unused,
+				    const char *str)
+{
+	if (!str)
+		return 0;
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+static inline
+int auxtrace__process_event(struct perf_session *session __maybe_unused,
+			    union perf_event *event __maybe_unused,
+			    struct perf_sample *sample __maybe_unused,
+			    struct perf_tool *tool __maybe_unused)
+{
+	return 0;
+}
+
+static inline
+int auxtrace__flush_events(struct perf_session *session __maybe_unused,
+			   struct perf_tool *tool __maybe_unused)
+{
+	return 0;
+}
+
+static inline
+void auxtrace__free_events(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache __maybe_unused)
+{
+}
+
+static inline
+void auxtrace__free(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+int auxtrace_index__write(int fd __maybe_unused,
+			  struct list_head *head __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_index__process(int fd __maybe_unused,
+			    u64 size __maybe_unused,
+			    struct perf_session *session __maybe_unused,
+			    bool needs_swap __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+void auxtrace_index__free(struct list_head *head __maybe_unused)
+{
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu);
+
+#endif
+
+#endif
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 6033a0a212ca..679c2c6d8ade 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -72,6 +72,10 @@ extern struct callchain_param callchain_param;
 struct callchain_list {
 	u64			ip;
 	struct map_symbol	ms;
+	struct /* for TUI */ {
+		bool		unfolded;
+		bool		has_children;
+	};
 	char		       *srcline;
 	struct list_head	list;
 };
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index dd17c9a32fbc..5bfc1198ab46 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -14,6 +14,7 @@
 #include <babeltrace/ctf-writer/event.h>
 #include <babeltrace/ctf-writer/event-types.h>
 #include <babeltrace/ctf-writer/event-fields.h>
+#include <babeltrace/ctf-ir/utils.h>
 #include <babeltrace/ctf/events.h>
 #include <traceevent/event-parse.h>
 #include "asm/bug.h"
@@ -38,12 +39,21 @@ struct evsel_priv {
 	struct bt_ctf_event_class *event_class;
 };
 
+#define MAX_CPUS	4096
+
+struct ctf_stream {
+	struct bt_ctf_stream *stream;
+	int cpu;
+	u32 count;
+};
+
 struct ctf_writer {
 	/* writer primitives */
-	struct bt_ctf_writer		*writer;
-	struct bt_ctf_stream		*stream;
-	struct bt_ctf_stream_class	*stream_class;
-	struct bt_ctf_clock		*clock;
+	struct bt_ctf_writer		 *writer;
+	struct ctf_stream		**stream;
+	int				  stream_cnt;
+	struct bt_ctf_stream_class	 *stream_class;
+	struct bt_ctf_clock		 *clock;
 
 	/* data types */
 	union {
@@ -65,6 +75,9 @@ struct convert {
 
 	u64			events_size;
 	u64			events_count;
+
+	/* Ordered events configured queue size. */
+	u64			queue_size;
 };
 
 static int value_set(struct bt_ctf_field_type *type,
@@ -153,6 +166,43 @@ get_tracepoint_field_type(struct ctf_writer *cw, struct format_field *field)
 		return cw->data.u32;
 }
 
+static unsigned long long adjust_signedness(unsigned long long value_int, int size)
+{
+	unsigned long long value_mask;
+
+	/*
+	 * Sign-extend a @size byte value to 64 bits.  value_mask covers the
+	 * bits below the sign bit: (1ULL << (size * 8 - 1)) - 1, spelled out.
+	 */
+	switch (size) {
+	case 1:
+		value_mask = 0x7fULL;
+		break;
+	case 2:
+		value_mask = 0x7fffULL;
+		break;
+	case 4:
+		value_mask = 0x7fffffffULL;
+		break;
+	case 8:
+		/*
+		 * A 64 bit value already fills the whole unsigned long long,
+		 * so there are no upper bits left to fill: return it as is.
+		 */
+		/* Fall through */
+	default:
+		/* BUG: unexpected size, the value is returned unchanged. */
+		return value_int;
+	}
+
+	/* Sign bit and all bits above it are clear: positive, don't adjust. */
+	if ((value_int & (~0ULL - value_mask)) == 0)
+		return value_int;
+
+	/* Set every bit from the sign bit upwards to get a negative long long. */
+	return (value_int & value_mask) | ~value_mask;
+}
+
 static int add_tracepoint_field_value(struct ctf_writer *cw,
 				      struct bt_ctf_event_class *event_class,
 				      struct bt_ctf_event *event,
@@ -164,7 +214,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	struct bt_ctf_field *field;
 	const char *name = fmtf->name;
 	void *data = sample->raw_data;
-	unsigned long long value_int;
 	unsigned long flags = fmtf->flags;
 	unsigned int n_items;
 	unsigned int i;
@@ -172,6 +221,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	unsigned int len;
 	int ret;
 
+	name = fmtf->alias;
 	offset = fmtf->offset;
 	len = fmtf->size;
 	if (flags & FIELD_IS_STRING)
@@ -208,11 +258,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	type = get_tracepoint_field_type(cw, fmtf);
 
 	for (i = 0; i < n_items; i++) {
-		if (!(flags & FIELD_IS_STRING))
-			value_int = pevent_read_number(
-					fmtf->event->pevent,
-					data + offset + i * len, len);
-
 		if (flags & FIELD_IS_ARRAY)
 			field = bt_ctf_field_array_get_field(array_field, i);
 		else
@@ -226,12 +271,21 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 		if (flags & FIELD_IS_STRING)
 			ret = bt_ctf_field_string_set_value(field,
 					data + offset + i * len);
-		else if (!(flags & FIELD_IS_SIGNED))
-			ret = bt_ctf_field_unsigned_integer_set_value(
-					field, value_int);
-		else
-			ret = bt_ctf_field_signed_integer_set_value(
-					field, value_int);
+		else {
+			unsigned long long value_int;
+
+			value_int = pevent_read_number(
+					fmtf->event->pevent,
+					data + offset + i * len, len);
+
+			if (!(flags & FIELD_IS_SIGNED))
+				ret = bt_ctf_field_unsigned_integer_set_value(
+						field, value_int);
+			else
+				ret = bt_ctf_field_signed_integer_set_value(
+						field, adjust_signedness(value_int, len));
+		}
+
 		if (ret) {
 			pr_err("failed to set file value %s\n", name);
 			goto err_put_field;
@@ -346,12 +400,6 @@ static int add_generic_values(struct ctf_writer *cw,
 			return -1;
 	}
 
-	if (type & PERF_SAMPLE_CPU) {
-		ret = value_set_u32(cw, event, "perf_cpu", sample->cpu);
-		if (ret)
-			return -1;
-	}
-
 	if (type & PERF_SAMPLE_PERIOD) {
 		ret = value_set_u64(cw, event, "perf_period", sample->period);
 		if (ret)
@@ -381,6 +429,129 @@ static int add_generic_values(struct ctf_writer *cw,
 	return 0;
 }
 
+static int ctf_stream__flush(struct ctf_stream *cs)
+{
+	int err = 0;
+
+	if (cs) {
+		err = bt_ctf_stream_flush(cs->stream);
+		if (err)
+			pr_err("CTF stream %d flush failed\n", cs->cpu);
+
+		pr("Flush stream for cpu %d (%u samples)\n",
+		   cs->cpu, cs->count);
+
+		cs->count = 0;
+	}
+
+	return err;
+}
+
+static struct ctf_stream *ctf_stream__create(struct ctf_writer *cw, int cpu)
+{
+	struct ctf_stream *cs;
+	struct bt_ctf_field *pkt_ctx   = NULL;
+	struct bt_ctf_field *cpu_field = NULL;
+	struct bt_ctf_stream *stream   = NULL;
+	int ret;
+
+	cs = zalloc(sizeof(*cs));
+	if (!cs) {
+		pr_err("Failed to allocate ctf stream\n");
+		return NULL;
+	}
+
+	stream = bt_ctf_writer_create_stream(cw->writer, cw->stream_class);
+	if (!stream) {
+		pr_err("Failed to create CTF stream\n");
+		goto out;
+	}
+
+	pkt_ctx = bt_ctf_stream_get_packet_context(stream);
+	if (!pkt_ctx) {
+		pr_err("Failed to obtain packet context\n");
+		goto out;
+	}
+
+	cpu_field = bt_ctf_field_structure_get_field(pkt_ctx, "cpu_id");
+	bt_ctf_field_put(pkt_ctx);
+	if (!cpu_field) {
+		pr_err("Failed to obtain cpu field\n");
+		goto out;
+	}
+
+	ret = bt_ctf_field_unsigned_integer_set_value(cpu_field, (u32) cpu);
+	if (ret) {
+		pr_err("Failed to update CPU number\n");
+		goto out;
+	}
+
+	bt_ctf_field_put(cpu_field);
+
+	cs->cpu    = cpu;
+	cs->stream = stream;
+	return cs;
+
+out:
+	if (cpu_field)
+		bt_ctf_field_put(cpu_field);
+	if (stream)
+		bt_ctf_stream_put(stream);
+
+	free(cs);
+	return NULL;
+}
+
+static void ctf_stream__delete(struct ctf_stream *cs)
+{
+	if (cs) {
+		bt_ctf_stream_put(cs->stream);
+		free(cs);
+	}
+}
+
+static struct ctf_stream *ctf_stream(struct ctf_writer *cw, int cpu)
+{
+	struct ctf_stream *cs = cw->stream[cpu];
+
+	if (!cs) {
+		cs = ctf_stream__create(cw, cpu);
+		cw->stream[cpu] = cs;
+	}
+
+	return cs;
+}
+
+/* Return the stream index for @sample: its CPU, or 0 if unknown/out of range. */
+static int get_sample_cpu(struct ctf_writer *cw, struct perf_sample *sample,
+			  struct perf_evsel *evsel)
+{
+	int cpu = 0;
+
+	if (evsel->attr.sample_type & PERF_SAMPLE_CPU)
+		cpu = sample->cpu;
+	/* cw->stream[] has stream_cnt slots, so stream_cnt itself is too big. */
+	if (cpu >= cw->stream_cnt) {
+		pr_err("Event was recorded for CPU %d, limit is at %d.\n",
+			cpu, cw->stream_cnt);
+		cpu = 0;
+	}
+	return cpu;
+}
+
+#define STREAM_FLUSH_COUNT 100000
+
+/*
+ * Currently we have no other way to determine the
+ * time for the stream flush other than keep track
+ * of the number of events and check it against
+ * threshold.
+ */
+static bool is_flush_needed(struct ctf_stream *cs)
+{
+	return cs->count >= STREAM_FLUSH_COUNT;
+}
+
 static int process_sample_event(struct perf_tool *tool,
 				union perf_event *_event __maybe_unused,
 				struct perf_sample *sample,
@@ -390,6 +561,7 @@ static int process_sample_event(struct perf_tool *tool,
 	struct convert *c = container_of(tool, struct convert, tool);
 	struct evsel_priv *priv = evsel->priv;
 	struct ctf_writer *cw = &c->writer;
+	struct ctf_stream *cs;
 	struct bt_ctf_event_class *event_class;
 	struct bt_ctf_event *event;
 	int ret;
@@ -424,9 +596,93 @@ static int process_sample_event(struct perf_tool *tool,
 			return -1;
 	}
 
-	bt_ctf_stream_append_event(cw->stream, event);
+	cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
+	if (cs) {
+		if (is_flush_needed(cs))
+			ctf_stream__flush(cs);
+
+		cs->count++;
+		bt_ctf_stream_append_event(cs->stream, event);
+	}
+
 	bt_ctf_event_put(event);
-	return 0;
+	return cs ? 0 : -1;
+}
+
+/* dup < 0: prefix name with '_'. Otherwise: orig_name plus a _dupl_X suffix. */
+static char *change_name(char *name, char *orig_name, int dup)
+{
+	char *new_name = NULL;
+	size_t len;
+
+	if (!name)
+		name = orig_name;
+
+	if (dup >= 10)
+		goto out;	/* "_dupl_X" below reserves room for one digit only */
+	/*
+	 * Add '_' prefix to potential keyword.  According to
+	 * Mathieu Desnoyers (https://lkml.org/lkml/2015/1/23/652),
+	 * further CTF spec updating may require us to use '$'.
+	 */
+	if (dup < 0)
+		len = strlen(name) + sizeof("_");
+	else
+		len = strlen(orig_name) + sizeof("_dupl_X");
+
+	new_name = malloc(len);
+	if (!new_name)
+		goto out;
+
+	if (dup < 0)
+		snprintf(new_name, len, "_%s", name);
+	else
+		snprintf(new_name, len, "%s_dupl_%d", orig_name, dup);
+
+out:
+	if (name != orig_name)
+		free(name);	/* the passed-in name is consumed unless it is orig_name */
+	return new_name;
+}
+
+/* Add @field under a valid, unique CTF name and remember it in ->alias. */
+static int event_class_add_field(struct bt_ctf_event_class *event_class,
+		struct bt_ctf_field_type *type, struct format_field *field)
+{
+	struct bt_ctf_field_type *t = NULL;
+	char *name;
+	int dup = 1;
+	int ret;
+
+	/* alias was already assigned */
+	if (field->alias != field->name)
+		return bt_ctf_event_class_add_field(event_class, type,
+				(char *)field->alias);
+
+	name = field->name;
+
+	/* If 'name' is a keyword, add prefix. */
+	if (bt_ctf_validate_identifier(name))
+		name = change_name(name, field->name, -1);
+	if (!name) {
+		pr_err("Failed to fix invalid identifier.\n");
+		return -1;
+	}
+	while ((t = bt_ctf_event_class_get_field_by_name(event_class, name))) {
+		bt_ctf_field_type_put(t);
+		name = change_name(name, field->name, dup++);
+		if (!name) {
+			pr_err("Failed to create dup name for '%s'\n", field->name);
+			return -1;
+		}
+	}
+
+	ret = bt_ctf_event_class_add_field(event_class, type, name);
+	if (!ret)
+		field->alias = name;
+	else if (name != field->name)
+		free(name);	/* generated name has no owner on failure */
+	return ret;
 }
 
 static int add_tracepoint_fields_types(struct ctf_writer *cw,
@@ -457,14 +713,14 @@ static int add_tracepoint_fields_types(struct ctf_writer *cw,
 		if (flags & FIELD_IS_ARRAY)
 			type = bt_ctf_field_type_array_create(type, field->arraylen);
 
-		ret = bt_ctf_event_class_add_field(event_class, type,
-				field->name);
+		ret = event_class_add_field(event_class, type, field);
 
 		if (flags & FIELD_IS_ARRAY)
 			bt_ctf_field_type_put(type);
 
 		if (ret) {
-			pr_err("Failed to add field '%s\n", field->name);
+			pr_err("Failed to add field '%s': %d\n",
+					field->name, ret);
 			return -1;
 		}
 	}
@@ -508,7 +764,7 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
 	do {								\
 		pr2("  field '%s'\n", n);				\
 		if (bt_ctf_event_class_add_field(cl, t, n)) {		\
-			pr_err("Failed to add field '%s;\n", n);	\
+			pr_err("Failed to add field '%s';\n", n);	\
 			return -1;					\
 		}							\
 	} while (0)
@@ -528,9 +784,6 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
 	if (type & PERF_SAMPLE_STREAM_ID)
 		ADD_FIELD(event_class, cw->data.u64, "perf_stream_id");
 
-	if (type & PERF_SAMPLE_CPU)
-		ADD_FIELD(event_class, cw->data.u32, "perf_cpu");
-
 	if (type & PERF_SAMPLE_PERIOD)
 		ADD_FIELD(event_class, cw->data.u64, "perf_period");
 
@@ -604,6 +857,39 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
 	return 0;
 }
 
+static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
+{
+	struct ctf_stream **stream;
+	struct perf_header *ph = &session->header;
+	int ncpus;
+
+	/*
+	 * Try to get the number of cpus used in the data file,
+	 * if not present fall back to MAX_CPUS.
+	 */
+	ncpus = ph->env.nr_cpus_avail ?: MAX_CPUS;
+
+	stream = zalloc(sizeof(*stream) * ncpus);
+	if (!stream) {
+		pr_err("Failed to allocate streams.\n");
+		return -ENOMEM;
+	}
+
+	cw->stream     = stream;
+	cw->stream_cnt = ncpus;
+	return 0;
+}
+
+static void free_streams(struct ctf_writer *cw)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < cw->stream_cnt; cpu++)
+		ctf_stream__delete(cw->stream[cpu]);
+
+	free(cw->stream);
+}
+
 static int ctf_writer__setup_env(struct ctf_writer *cw,
 				 struct perf_session *session)
 {
@@ -713,7 +999,7 @@ static void ctf_writer__cleanup(struct ctf_writer *cw)
 	ctf_writer__cleanup_data(cw);
 
 	bt_ctf_clock_put(cw->clock);
-	bt_ctf_stream_put(cw->stream);
+	free_streams(cw);
 	bt_ctf_stream_class_put(cw->stream_class);
 	bt_ctf_writer_put(cw->writer);
 
@@ -725,8 +1011,9 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
 {
 	struct bt_ctf_writer		*writer;
 	struct bt_ctf_stream_class	*stream_class;
-	struct bt_ctf_stream		*stream;
 	struct bt_ctf_clock		*clock;
+	struct bt_ctf_field_type	*pkt_ctx_type;
+	int				ret;
 
 	/* CTF writer */
 	writer = bt_ctf_writer_create(path);
@@ -767,14 +1054,15 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
 	if (ctf_writer__init_data(cw))
 		goto err_cleanup;
 
-	/* CTF stream instance */
-	stream = bt_ctf_writer_create_stream(writer, stream_class);
-	if (!stream) {
-		pr("Failed to create CTF stream.\n");
+	/* Add cpu_id for packet context */
+	pkt_ctx_type = bt_ctf_stream_class_get_packet_context_type(stream_class);
+	if (!pkt_ctx_type)
 		goto err_cleanup;
-	}
 
-	cw->stream = stream;
+	ret = bt_ctf_field_type_structure_add_field(pkt_ctx_type, cw->data.u32, "cpu_id");
+	bt_ctf_field_type_put(pkt_ctx_type);
+	if (ret)
+		goto err_cleanup;
 
 	/* CTF clock writer setup */
 	if (bt_ctf_writer_add_clock(writer, clock)) {
@@ -791,6 +1079,28 @@ err:
 	return -1;
 }
 
+static int ctf_writer__flush_streams(struct ctf_writer *cw)
+{
+	int cpu, ret = 0;
+
+	for (cpu = 0; cpu < cw->stream_cnt && !ret; cpu++)
+		ret = ctf_stream__flush(cw->stream[cpu]);
+
+	return ret;
+}
+
+static int convert__config(const char *var, const char *value, void *cb)
+{
+	struct convert *c = cb;
+
+	if (!strcmp(var, "convert.queue-size")) {
+		c->queue_size = perf_config_u64(var, value);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
 int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 {
 	struct perf_session *session;
@@ -817,6 +1127,8 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	struct ctf_writer *cw = &c.writer;
 	int err = -1;
 
+	perf_config(convert__config, &c);
+
 	/* CTF writer */
 	if (ctf_writer__init(cw, path))
 		return -1;
@@ -826,6 +1138,11 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	if (!session)
 		goto free_writer;
 
+	if (c.queue_size) {
+		ordered_events__set_alloc_size(&session->ordered_events,
+					       c.queue_size);
+	}
+
 	/* CTF writer env/clock setup  */
 	if (ctf_writer__setup_env(cw, session))
 		goto free_session;
@@ -834,9 +1151,14 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	if (setup_events(cw, session))
 		goto free_session;
 
+	if (setup_streams(cw, session))
+		goto free_session;
+
 	err = perf_session__process_events(session);
 	if (!err)
-		err = bt_ctf_stream_flush(cw->stream);
+		err = ctf_writer__flush_streams(cw);
+	else
+		pr_err("Error during conversion.\n");
 
 	fprintf(stderr,
 		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
@@ -847,11 +1169,15 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 		(double) c.events_size / 1024.0 / 1024.0,
 		c.events_count);
 
-	/* its all good */
-free_session:
 	perf_session__delete(session);
+	ctf_writer__cleanup(cw);
+
+	return err;
 
+free_session:
+	perf_session__delete(session);
 free_writer:
 	ctf_writer__cleanup(cw);
+	pr_err("Error during conversion setup.\n");
 	return err;
 }
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index fc0ddd5792a9..13d9ae0bd15c 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -4,6 +4,7 @@
 #include "symbol.h"
 #include "dso.h"
 #include "machine.h"
+#include "auxtrace.h"
 #include "util.h"
 #include "debug.h"
 
@@ -961,6 +962,7 @@ void dso__delete(struct dso *dso)
 	}
 
 	dso__data_close(dso);
+	auxtrace_cache__free(dso->auxtrace_cache);
 	dso_cache__free(&dso->data.cache);
 	dso__free_a2l(dso);
 	zfree(&dso->symsrc_filename);
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index e0901b4ed8de..3d79c749934c 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -126,6 +126,8 @@ struct dsos {
 	struct rb_root	 root;	/* rbtree root sorted by long name */
 };
 
+struct auxtrace_cache;
+
 struct dso {
 	struct list_head node;
 	struct rb_node	 rb_node;	/* rbtree node sorted by long name */
@@ -156,6 +158,7 @@ struct dso {
 	u16		 long_name_len;
 	u16		 short_name_len;
 	void		*dwfl;			/* DWARF debug info */
+	struct auxtrace_cache *auxtrace_cache;
 
 	/* dso data file */
 	struct {
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index ff866c4d2e2f..db526091f580 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -23,12 +23,17 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_FORK]			= "FORK",
 	[PERF_RECORD_READ]			= "READ",
 	[PERF_RECORD_SAMPLE]			= "SAMPLE",
+	[PERF_RECORD_AUX]			= "AUX",
+	[PERF_RECORD_ITRACE_START]		= "ITRACE_START",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
 	[PERF_RECORD_HEADER_BUILD_ID]		= "BUILD_ID",
 	[PERF_RECORD_FINISHED_ROUND]		= "FINISHED_ROUND",
 	[PERF_RECORD_ID_INDEX]			= "ID_INDEX",
+	[PERF_RECORD_AUXTRACE_INFO]		= "AUXTRACE_INFO",
+	[PERF_RECORD_AUXTRACE]			= "AUXTRACE",
+	[PERF_RECORD_AUXTRACE_ERROR]		= "AUXTRACE_ERROR",
 };
 
 const char *perf_event__name(unsigned int id)
@@ -692,6 +697,22 @@ int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
 	return machine__process_lost_event(machine, event, sample);
 }
 
+int perf_event__process_aux(struct perf_tool *tool __maybe_unused,
+			    union perf_event *event,
+			    struct perf_sample *sample __maybe_unused,
+			    struct machine *machine)
+{
+	return machine__process_aux_event(machine, event);
+}
+
+int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused,
+				     union perf_event *event,
+				     struct perf_sample *sample __maybe_unused,
+				     struct machine *machine)
+{
+	return machine__process_itrace_start_event(machine, event);
+}
+
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
 {
 	return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n",
@@ -755,6 +776,21 @@ int perf_event__process_exit(struct perf_tool *tool __maybe_unused,
 	return machine__process_exit_event(machine, event, sample);
 }
 
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s]\n",
+		       event->aux.aux_offset, event->aux.aux_size,
+		       event->aux.flags,
+		       event->aux.flags & PERF_AUX_FLAG_TRUNCATED ? "T" : "",
+		       event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "");
+}
+
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " pid: %u tid: %u\n",
+		       event->itrace_start.pid, event->itrace_start.tid);
+}
+
 size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -774,6 +810,12 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_MMAP2:
 		ret += perf_event__fprintf_mmap2(event, fp);
 		break;
+	case PERF_RECORD_AUX:
+		ret += perf_event__fprintf_aux(event, fp);
+		break;
+	case PERF_RECORD_ITRACE_START:
+		ret += perf_event__fprintf_itrace_start(event, fp);
+		break;
 	default:
 		ret += fprintf(fp, "\n");
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 09b9e8d3fcf7..7eecd5e23d77 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -157,6 +157,8 @@ enum {
 	PERF_IP_FLAG_IN_TX		= 1ULL << 10,
 };
 
+#define PERF_IP_FLAG_CHARS "bcrosyiABEx"
+
 #define PERF_BRANCH_MASK		(\
 	PERF_IP_FLAG_BRANCH		|\
 	PERF_IP_FLAG_CALL		|\
@@ -215,9 +217,17 @@ enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_BUILD_ID		= 67,
 	PERF_RECORD_FINISHED_ROUND		= 68,
 	PERF_RECORD_ID_INDEX			= 69,
+	PERF_RECORD_AUXTRACE_INFO		= 70,
+	PERF_RECORD_AUXTRACE			= 71,
+	PERF_RECORD_AUXTRACE_ERROR		= 72,
 	PERF_RECORD_HEADER_MAX
 };
 
+enum auxtrace_error_type {
+	PERF_AUXTRACE_ERROR_ITRACE  = 1,
+	PERF_AUXTRACE_ERROR_MAX
+};
+
 /*
  * The kernel collects the number of events it couldn't send in a stretch and
  * when possible sends this number in a PERF_RECORD_LOST event. The number of
@@ -242,6 +252,7 @@ struct events_stats {
 	u32 nr_invalid_chains;
 	u32 nr_unknown_id;
 	u32 nr_unprocessable_samples;
+	u32 nr_auxtrace_errors[PERF_AUXTRACE_ERROR_MAX];
 };
 
 struct attr_event {
@@ -280,6 +291,50 @@ struct id_index_event {
 	struct id_index_entry entries[0];
 };
 
+struct auxtrace_info_event {
+	struct perf_event_header header;
+	u32 type;
+	u32 reserved__; /* For alignment */
+	u64 priv[];
+};
+
+struct auxtrace_event {
+	struct perf_event_header header;
+	u64 size;
+	u64 offset;
+	u64 reference;
+	u32 idx;
+	u32 tid;
+	u32 cpu;
+	u32 reserved__; /* For alignment */
+};
+
+#define MAX_AUXTRACE_ERROR_MSG 64
+
+struct auxtrace_error_event {
+	struct perf_event_header header;
+	u32 type;
+	u32 code;
+	u32 cpu;
+	u32 pid;
+	u32 tid;
+	u32 reserved__; /* For alignment */
+	u64 ip;
+	char msg[MAX_AUXTRACE_ERROR_MSG];
+};
+
+struct aux_event {
+	struct perf_event_header header;
+	u64	aux_offset;
+	u64	aux_size;
+	u64	flags;
+};
+
+struct itrace_start_event {
+	struct perf_event_header header;
+	u32 pid, tid;
+};
+
 union perf_event {
 	struct perf_event_header	header;
 	struct mmap_event		mmap;
@@ -295,6 +350,11 @@ union perf_event {
 	struct tracing_data_event	tracing_data;
 	struct build_id_event		build_id;
 	struct id_index_event		id_index;
+	struct auxtrace_info_event	auxtrace_info;
+	struct auxtrace_event		auxtrace;
+	struct auxtrace_error_event	auxtrace_error;
+	struct aux_event		aux;
+	struct itrace_start_event	itrace_start;
 };
 
 void perf_event__print_totals(void);
@@ -330,6 +390,14 @@ int perf_event__process_lost(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
 			     struct machine *machine);
+int perf_event__process_aux(struct perf_tool *tool,
+			    union perf_event *event,
+			    struct perf_sample *sample,
+			    struct machine *machine);
+int perf_event__process_itrace_start(struct perf_tool *tool,
+				     union perf_event *event,
+				     struct perf_sample *sample,
+				     struct machine *machine);
 int perf_event__process_mmap(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
@@ -387,6 +455,8 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 u64 kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 080be93eea96..7ec1bf93ab28 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -695,7 +695,7 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 
 static bool perf_mmap__empty(struct perf_mmap *md)
 {
-	return perf_mmap__read_head(md) == md->prev;
+	return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base;
 }
 
 static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx)
@@ -725,6 +725,34 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
 		perf_evlist__mmap_put(evlist, idx);
 }
 
+int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
+			       struct auxtrace_mmap_params *mp __maybe_unused,
+			       void *userpg __maybe_unused,
+			       int fd __maybe_unused)
+{
+	return 0;
+}
+
+void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__init(
+			struct auxtrace_mmap_params *mp __maybe_unused,
+			off_t auxtrace_offset __maybe_unused,
+			unsigned int auxtrace_pages __maybe_unused,
+			bool auxtrace_overwrite __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__set_idx(
+			struct auxtrace_mmap_params *mp __maybe_unused,
+			struct perf_evlist *evlist __maybe_unused,
+			int idx __maybe_unused,
+			bool per_cpu __maybe_unused)
+{
+}
+
 static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
 {
 	if (evlist->mmap[idx].base != NULL) {
@@ -732,6 +760,7 @@ static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
 		evlist->mmap[idx].base = NULL;
 		evlist->mmap[idx].refcnt = 0;
 	}
+	auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap);
 }
 
 void perf_evlist__munmap(struct perf_evlist *evlist)
@@ -759,6 +788,7 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 struct mmap_params {
 	int prot;
 	int mask;
+	struct auxtrace_mmap_params auxtrace_mp;
 };
 
 static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
@@ -789,6 +819,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
 		return -1;
 	}
 
+	if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap,
+				&mp->auxtrace_mp, evlist->mmap[idx].base, fd))
+		return -1;
+
 	return 0;
 }
 
@@ -853,6 +887,9 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist,
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
 		int output = -1;
 
+		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu,
+					      true);
+
 		for (thread = 0; thread < nr_threads; thread++) {
 			if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
 							thread, &output))
@@ -878,6 +915,9 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,
 	for (thread = 0; thread < nr_threads; thread++) {
 		int output = -1;
 
+		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
+					      false);
+
 		if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
 						&output))
 			goto out_unmap;
@@ -960,10 +1000,8 @@ static long parse_pages_arg(const char *str, unsigned long min,
 	return pages;
 }
 
-int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
-				  int unset __maybe_unused)
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str)
 {
-	unsigned int *mmap_pages = opt->value;
 	unsigned long max = UINT_MAX;
 	long pages;
 
@@ -980,20 +1018,32 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
 	return 0;
 }
 
+int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
+				  int unset __maybe_unused)
+{
+	return __perf_evlist__parse_mmap_pages(opt->value, str);
+}
+
 /**
- * perf_evlist__mmap - Create mmaps to receive events.
+ * perf_evlist__mmap_ex - Create mmaps to receive events.
  * @evlist: list of events
  * @pages: map length in pages
  * @overwrite: overwrite older events?
+ * @auxtrace_pages: auxtrace map length in pages
+ * @auxtrace_overwrite: overwrite older auxtrace data?
  *
  * If @overwrite is %false the user needs to signal event consumption using
  * perf_mmap__write_tail().  Using perf_evlist__mmap_read() does this
  * automatically.
  *
+ * Similarly, if @auxtrace_overwrite is %false the user needs to signal data
+ * consumption using auxtrace_mmap__write_tail().
+ *
  * Return: %0 on success, negative error code otherwise.
  */
-int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
-		      bool overwrite)
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+			 bool overwrite, unsigned int auxtrace_pages,
+			 bool auxtrace_overwrite)
 {
 	struct perf_evsel *evsel;
 	const struct cpu_map *cpus = evlist->cpus;
@@ -1013,6 +1063,9 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 	pr_debug("mmap size %zuB\n", evlist->mmap_len);
 	mp.mask = evlist->mmap_len - page_size - 1;
 
+	auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len,
+				   auxtrace_pages, auxtrace_overwrite);
+
 	evlist__for_each(evlist, evsel) {
 		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
 		    evsel->sample_id == NULL &&
@@ -1026,6 +1079,12 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 	return perf_evlist__mmap_per_cpu(evlist, &mp);
 }
 
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
+		      bool overwrite)
+{
+	return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
+}
+
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
 {
 	evlist->threads = thread_map__new_str(target->pid, target->tid,
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index b5cce95d644e..c07b1a94a724 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -8,6 +8,7 @@
 #include "event.h"
 #include "evsel.h"
 #include "util.h"
+#include "auxtrace.h"
 #include <unistd.h>
 
 struct pollfd;
@@ -28,6 +29,7 @@ struct perf_mmap {
 	int		 mask;
 	int		 refcnt;
 	u64		 prev;
+	struct auxtrace_mmap auxtrace_mmap;
 	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
 };
 
@@ -122,10 +124,14 @@ int perf_evlist__start_workload(struct perf_evlist *evlist);
 
 struct option;
 
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str);
 int perf_evlist__parse_mmap_pages(const struct option *opt,
 				  const char *str,
 				  int unset);
 
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+			 bool overwrite, unsigned int auxtrace_pages,
+			 bool auxtrace_overwrite);
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 		      bool overwrite);
 void perf_evlist__munmap(struct perf_evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 33e3fd8c2e68..c886b9f7a48d 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1121,6 +1121,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(sample_stack_user, p_unsigned);
 	PRINT_ATTRf(clockid, p_signed);
 	PRINT_ATTRf(sample_regs_intr, p_hex);
+	PRINT_ATTRf(aux_watermark, p_unsigned);
 
 	return ret;
 }
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 918fd8ae2d80..3f0d809d853a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -869,6 +869,20 @@ static int write_branch_stack(int fd __maybe_unused,
 	return 0;
 }
 
+static int write_auxtrace(int fd, struct perf_header *h,
+			  struct perf_evlist *evlist __maybe_unused)
+{
+	struct perf_session *session;
+	int err;
+
+	session = container_of(h, struct perf_session, header);
+
+	err = auxtrace_index__write(fd, &session->auxtrace_index);
+	if (err < 0)
+		pr_err("Failed to write auxtrace index\n");
+	return err;
+}
+
 static void print_hostname(struct perf_header *ph, int fd __maybe_unused,
 			   FILE *fp)
 {
@@ -1151,6 +1165,12 @@ static void print_branch_stack(struct perf_header *ph __maybe_unused,
 	fprintf(fp, "# contains samples with branch stack\n");
 }
 
+static void print_auxtrace(struct perf_header *ph __maybe_unused,
+			   int fd __maybe_unused, FILE *fp)
+{
+	fprintf(fp, "# contains AUX area data (e.g. instruction trace)\n");
+}
+
 static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
 			       FILE *fp)
 {
@@ -1821,6 +1841,22 @@ out_free:
 	return ret;
 }
 
+static int process_auxtrace(struct perf_file_section *section,
+			    struct perf_header *ph, int fd,
+			    void *data __maybe_unused)
+{
+	struct perf_session *session;
+	int err;
+
+	session = container_of(ph, struct perf_session, header);
+
+	err = auxtrace_index__process(fd, section->size, session,
+				      ph->needs_swap);
+	if (err < 0)
+		pr_err("Failed to process auxtrace index\n");
+	return err;
+}
+
 struct feature_ops {
 	int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
 	void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1861,6 +1897,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPA(HEADER_BRANCH_STACK,	branch_stack),
 	FEAT_OPP(HEADER_PMU_MAPPINGS,	pmu_mappings),
 	FEAT_OPP(HEADER_GROUP_DESC,	group_desc),
+	FEAT_OPP(HEADER_AUXTRACE,	auxtrace),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 3bb90ac172a1..d4d57962c591 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -30,6 +30,7 @@ enum {
 	HEADER_BRANCH_STACK,
 	HEADER_PMU_MAPPINGS,
 	HEADER_GROUP_DESC,
+	HEADER_AUXTRACE,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index cc22b9158b93..338770679863 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1163,7 +1163,7 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h
 		return;
 
 	/* force fold unfiltered entry for simplicity */
-	h->ms.unfolded = false;
+	h->unfolded = false;
 	h->row_offset = 0;
 	h->nr_rows = 0;
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 527e032e24f6..2f471105efb1 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -486,6 +486,22 @@ machine__module_dso(struct machine *machine, struct kmod_path *m,
 	return dso;
 }
 
+int machine__process_aux_event(struct machine *machine __maybe_unused,
+			       union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_aux(event, stdout);
+	return 0;
+}
+
+int machine__process_itrace_start_event(struct machine *machine __maybe_unused,
+					union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_itrace_start(event, stdout);
+	return 0;
+}
+
 struct map *machine__new_module(struct machine *machine, u64 start,
 				const char *filename)
 {
@@ -1331,6 +1347,11 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 		ret = machine__process_exit_event(machine, event, sample); break;
 	case PERF_RECORD_LOST:
 		ret = machine__process_lost_event(machine, event, sample); break;
+	case PERF_RECORD_AUX:
+		ret = machine__process_aux_event(machine, event); break;
+	case PERF_RECORD_ITRACE_START:
+		ret = machine__process_itrace_start_event(machine, event);
+		break;
 	default:
 		ret = -1;
 		break;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 6d64cedb9d1e..1d992961d5d1 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -81,6 +81,10 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 				struct perf_sample *sample);
 int machine__process_lost_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
+int machine__process_aux_event(struct machine *machine,
+			       union perf_event *event);
+int machine__process_itrace_start_event(struct machine *machine,
+					union perf_event *event);
 int machine__process_mmap_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index a14f08f41686..cd0e335008b4 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -292,6 +292,11 @@ int map__load(struct map *map, symbol_filter_t filter)
 	return 0;
 }
 
+int __weak arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+	return strcmp(namea, nameb);
+}
+
 struct symbol *map__find_symbol(struct map *map, u64 addr,
 				symbol_filter_t filter)
 {
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index ec19c59ca38e..4e0c729841ab 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -124,7 +124,7 @@ struct thread;
  */
 #define __map__for_each_symbol_by_name(map, sym_name, pos, filter)	\
 	for (pos = map__find_symbol_by_name(map, sym_name, filter);	\
-	     pos && strcmp(pos->name, sym_name) == 0;		\
+	     pos && arch__compare_symbol_names(pos->name, sym_name) == 0;	\
 	     pos = symbol__next_by_name(pos))
 
 #define map__for_each_symbol_by_name(map, sym_name, pos)		\
@@ -132,6 +132,7 @@ struct thread;
 
 typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
 
+int arch__compare_symbol_names(const char *namea, const char *nameb);
 void map__init(struct map *map, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso);
 struct map *map__new(struct machine *machine, u64 start, u64 len,
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index be0655388b38..80a50fdb6d8a 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -17,6 +17,7 @@
 #include "parse-events-flex.h"
 #include "pmu.h"
 #include "thread_map.h"
+#include "asm/bug.h"
 
 #define MAX_NAME_LEN 100
 
@@ -24,6 +25,12 @@
 extern int parse_events_debug;
 #endif
 int parse_events_parse(void *data, void *scanner);
+int parse_events_term__num(struct parse_events_term **term,
+			   int type_term, char *config, u64 num,
+			   YYLTYPE *loc_term, YYLTYPE *loc_val);
+int parse_events_term__str(struct parse_events_term **term,
+			   int type_term, char *config, char *str,
+			   YYLTYPE *loc_term, YYLTYPE *loc_val);
 
 static struct perf_pmu_event_symbol *perf_pmu_events_list;
 /*
@@ -538,16 +545,40 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx,
 	return add_event(list, idx, &attr, NULL);
 }
 
+static int check_type_val(struct parse_events_term *term,
+			  struct parse_events_error *err,
+			  int type)
+{
+	if (type == term->type_val)
+		return 0;
+
+	if (err) {
+		err->idx = term->err_val;
+		if (type == PARSE_EVENTS__TERM_TYPE_NUM)
+			err->str = strdup("expected numeric value");
+		else
+			err->str = strdup("expected string value");
+	}
+	return -EINVAL;
+}
+
 static int config_term(struct perf_event_attr *attr,
-		       struct parse_events_term *term)
+		       struct parse_events_term *term,
+		       struct parse_events_error *err)
 {
-#define CHECK_TYPE_VAL(type)					\
-do {								\
-	if (PARSE_EVENTS__TERM_TYPE_ ## type != term->type_val)	\
-		return -EINVAL;					\
+#define CHECK_TYPE_VAL(type)						   \
+do {									   \
+	if (check_type_val(term, err, PARSE_EVENTS__TERM_TYPE_ ## type)) \
+		return -EINVAL;						   \
 } while (0)
 
 	switch (term->type_term) {
+	case PARSE_EVENTS__TERM_TYPE_USER:
+		/*
+		 * Always succeed for sysfs terms, as we don't know
+		 * at this point what type they need to have.
+		 */
+		return 0;
 	case PARSE_EVENTS__TERM_TYPE_CONFIG:
 		CHECK_TYPE_VAL(NUM);
 		attr->config = term->val.num;
@@ -582,18 +613,20 @@ do {								\
 }
 
 static int config_attr(struct perf_event_attr *attr,
-		       struct list_head *head, int fail)
+		       struct list_head *head,
+		       struct parse_events_error *err)
 {
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, head, list)
-		if (config_term(attr, term) && fail)
+		if (config_term(attr, term, err))
 			return -EINVAL;
 
 	return 0;
 }
 
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config)
 {
@@ -604,10 +637,10 @@ int parse_events_add_numeric(struct list_head *list, int *idx,
 	attr.config = config;
 
 	if (head_config &&
-	    config_attr(&attr, head_config, 1))
+	    config_attr(&attr, head_config, data->error))
 		return -EINVAL;
 
-	return add_event(list, idx, &attr, NULL);
+	return add_event(list, &data->idx, &attr, NULL);
 }
 
 static int parse_events__is_name_term(struct parse_events_term *term)
@@ -626,8 +659,9 @@ static char *pmu_event_name(struct list_head *head_terms)
 	return NULL;
 }
 
-int parse_events_add_pmu(struct list_head *list, int *idx,
-			 char *name, struct list_head *head_config)
+int parse_events_add_pmu(struct parse_events_evlist *data,
+			 struct list_head *list, char *name,
+			 struct list_head *head_config)
 {
 	struct perf_event_attr attr;
 	struct perf_pmu_info info;
@@ -647,7 +681,7 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
 
 	if (!head_config) {
 		attr.type = pmu->type;
-		evsel = __add_event(list, idx, &attr, NULL, pmu->cpus);
+		evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus);
 		return evsel ? 0 : -ENOMEM;
 	}
 
@@ -658,13 +692,14 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
 	 * Configure hardcoded terms first, no need to check
 	 * return value when called with fail == 0 ;)
 	 */
-	config_attr(&attr, head_config, 0);
+	if (config_attr(&attr, head_config, data->error))
+		return -EINVAL;
 
-	if (perf_pmu__config(pmu, &attr, head_config))
+	if (perf_pmu__config(pmu, &attr, head_config, data->error))
 		return -EINVAL;
 
-	evsel = __add_event(list, idx, &attr, pmu_event_name(head_config),
-			    pmu->cpus);
+	evsel = __add_event(list, &data->idx, &attr,
+			    pmu_event_name(head_config), pmu->cpus);
 	if (evsel) {
 		evsel->unit = info.unit;
 		evsel->scale = info.scale;
@@ -1019,11 +1054,13 @@ int parse_events_terms(struct list_head *terms, const char *str)
 	return ret;
 }
 
-int parse_events(struct perf_evlist *evlist, const char *str)
+int parse_events(struct perf_evlist *evlist, const char *str,
+		 struct parse_events_error *err)
 {
 	struct parse_events_evlist data = {
-		.list = LIST_HEAD_INIT(data.list),
-		.idx  = evlist->nr_entries,
+		.list  = LIST_HEAD_INIT(data.list),
+		.idx   = evlist->nr_entries,
+		.error = err,
 	};
 	int ret;
 
@@ -1044,16 +1081,87 @@ int parse_events(struct perf_evlist *evlist, const char *str)
 	return ret;
 }
 
+#define MAX_WIDTH 1000
+static int get_term_width(void)
+{
+	struct winsize ws;
+
+	get_term_dimensions(&ws);
+	return ws.ws_col > MAX_WIDTH ? MAX_WIDTH : ws.ws_col;
+}
+
+static void parse_events_print_error(struct parse_events_error *err,
+				     const char *event)
+{
+	const char *str = "invalid or unsupported event: ";
+	char _buf[MAX_WIDTH];
+	char *buf = (char *) event;
+	int idx = 0;
+
+	if (err->str) {
+		/* -2 for extra '' in the final fprintf */
+		int width       = get_term_width() - 2;
+		int len_event   = strlen(event);
+		int len_str, max_len, cut = 0;
+
+		/*
+		 * Maximum error index indent, we will cut
+		 * the event string if it's bigger.
+		 */
+		int max_err_idx = 10;
+
+		/*
+		 * Let's be specific with the message when
+		 * we have the precise error.
+		 */
+		str     = "event syntax error: ";
+		len_str = strlen(str);
+		max_len = width - len_str;
+
+		buf = _buf;
+
+		/* We're cutting from the beginning. */
+		if (err->idx > max_err_idx)
+			cut = err->idx - max_err_idx;
+
+		strncpy(buf, event + cut, max_len);
+
+		/* Mark cut parts with '..' on both sides. */
+		if (cut)
+			buf[0] = buf[1] = '.';
+
+		if ((len_event - cut) > max_len) {
+			buf[max_len - 1] = buf[max_len - 2] = '.';
+			buf[max_len] = 0;
+		}
+
+		idx = len_str + err->idx - cut;
+	}
+
+	fprintf(stderr, "%s'%s'\n", str, buf);
+	if (idx) {
+		fprintf(stderr, "%*s\\___ %s\n", idx + 1, "", err->str);
+		if (err->help)
+			fprintf(stderr, "\n%s\n", err->help);
+		free(err->str);
+		free(err->help);
+	}
+
+	fprintf(stderr, "Run 'perf list' for a list of valid events\n");
+}
+
+#undef MAX_WIDTH
+
 int parse_events_option(const struct option *opt, const char *str,
 			int unset __maybe_unused)
 {
 	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
-	int ret = parse_events(evlist, str);
+	struct parse_events_error err = { .idx = 0, };
+	int ret = parse_events(evlist, str, &err);
+
+	if (ret)
+		parse_events_print_error(&err, str);
 
-	if (ret) {
-		fprintf(stderr, "invalid or unsupported event: '%s'\n", str);
-		fprintf(stderr, "Run 'perf list' for a list of valid events\n");
-	}
 	return ret;
 }
 
@@ -1460,7 +1568,7 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term)
 
 static int new_term(struct parse_events_term **_term, int type_val,
 		    int type_term, char *config,
-		    char *str, u64 num)
+		    char *str, u64 num, int err_term, int err_val)
 {
 	struct parse_events_term *term;
 
@@ -1472,6 +1580,8 @@ static int new_term(struct parse_events_term **_term, int type_val,
 	term->type_val  = type_val;
 	term->type_term = type_term;
 	term->config = config;
+	term->err_term = err_term;
+	term->err_val  = err_val;
 
 	switch (type_val) {
 	case PARSE_EVENTS__TERM_TYPE_NUM:
@@ -1490,17 +1600,23 @@ static int new_term(struct parse_events_term **_term, int type_val,
 }
 
 int parse_events_term__num(struct parse_events_term **term,
-			   int type_term, char *config, u64 num)
+			   int type_term, char *config, u64 num,
+			   YYLTYPE *loc_term, YYLTYPE *loc_val)
 {
 	return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
-			config, NULL, num);
+			config, NULL, num,
+			loc_term ? loc_term->first_column : 0,
+			loc_val ? loc_val->first_column : 0);
 }
 
 int parse_events_term__str(struct parse_events_term **term,
-			   int type_term, char *config, char *str)
+			   int type_term, char *config, char *str,
+			   YYLTYPE *loc_term, YYLTYPE *loc_val)
 {
 	return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
-			config, str, 0);
+			config, str, 0,
+			loc_term ? loc_term->first_column : 0,
+			loc_val ? loc_val->first_column : 0);
 }
 
 int parse_events_term__sym_hw(struct parse_events_term **term,
@@ -1514,18 +1630,20 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
 	if (config)
 		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
 				PARSE_EVENTS__TERM_TYPE_USER, config,
-				(char *) sym->symbol, 0);
+				(char *) sym->symbol, 0, 0, 0);
 	else
 		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
 				PARSE_EVENTS__TERM_TYPE_USER,
-				(char *) "event", (char *) sym->symbol, 0);
+				(char *) "event", (char *) sym->symbol,
+				0, 0, 0);
 }
 
 int parse_events_term__clone(struct parse_events_term **new,
 			     struct parse_events_term *term)
 {
 	return new_term(new, term->type_val, term->type_term, term->config,
-			term->val.str, term->val.num);
+			term->val.str, term->val.num,
+			term->err_term, term->err_val);
 }
 
 void parse_events__free_terms(struct list_head *terms)
@@ -1535,3 +1653,13 @@ void parse_events__free_terms(struct list_head *terms)
 	list_for_each_entry_safe(term, h, terms, list)
 		free(term);
 }
+
+void parse_events_evlist_error(struct parse_events_evlist *data,
+			       int idx, const char *str)
+{
+	struct parse_events_error *err = data->error;
+
+	err->idx = idx;
+	err->str = strdup(str);
+	WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 52a2dda4f954..e236f1b6ac6f 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -12,6 +12,7 @@
 struct list_head;
 struct perf_evsel;
 struct perf_evlist;
+struct parse_events_error;
 
 struct option;
 
@@ -29,7 +30,8 @@ const char *event_type(int type);
 
 extern int parse_events_option(const struct option *opt, const char *str,
 			       int unset);
-extern int parse_events(struct perf_evlist *evlist, const char *str);
+extern int parse_events(struct perf_evlist *evlist, const char *str,
+			struct parse_events_error *error);
 extern int parse_events_terms(struct list_head *terms, const char *str);
 extern int parse_filter(const struct option *opt, const char *str, int unset);
 
@@ -72,12 +74,23 @@ struct parse_events_term {
 	int type_term;
 	struct list_head list;
 	bool used;
+
+	/* error string indexes within the parsed string */
+	int err_term;
+	int err_val;
+};
+
+struct parse_events_error {
+	int   idx;	/* index in the parsed string */
+	char *str;      /* string to display at the index */
+	char *help;	/* optional help string */
 };
 
 struct parse_events_evlist {
-	struct list_head list;
-	int idx;
-	int nr_groups;
+	struct list_head	   list;
+	int			   idx;
+	int			   nr_groups;
+	struct parse_events_error *error;
 };
 
 struct parse_events_terms {
@@ -85,10 +98,6 @@ struct parse_events_terms {
 };
 
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
-int parse_events_term__num(struct parse_events_term **_term,
-			   int type_term, char *config, u64 num);
-int parse_events_term__str(struct parse_events_term **_term,
-			   int type_term, char *config, char *str);
 int parse_events_term__sym_hw(struct parse_events_term **term,
 			      char *config, unsigned idx);
 int parse_events_term__clone(struct parse_events_term **new,
@@ -99,21 +108,24 @@ int parse_events__modifier_group(struct list_head *list, char *event_mod);
 int parse_events_name(struct list_head *list, char *name);
 int parse_events_add_tracepoint(struct list_head *list, int *idx,
 				char *sys, char *event);
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config);
 int parse_events_add_cache(struct list_head *list, int *idx,
 			   char *type, char *op_result1, char *op_result2);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
 				void *ptr, char *type, u64 len);
-int parse_events_add_pmu(struct list_head *list, int *idx,
-			 char *pmu , struct list_head *head_config);
+int parse_events_add_pmu(struct parse_events_evlist *data,
+			 struct list_head *list, char *name,
+			 struct list_head *head_config);
 enum perf_pmu_event_symbol_type
 perf_pmu__parse_check(const char *name);
 void parse_events__set_leader(char *name, struct list_head *list);
 void parse_events_update_lists(struct list_head *list_event,
 			       struct list_head *list_all);
-void parse_events_error(void *data, void *scanner, char const *msg);
+void parse_events_evlist_error(struct parse_events_evlist *data,
+			       int idx, const char *str);
 
 void print_events(const char *event_glob, bool name_only);
 
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 8895cf3132ab..09e738fe9ea2 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -3,6 +3,8 @@
 %option bison-bridge
 %option prefix="parse_events_"
 %option stack
+%option bison-locations
+%option yylineno
 
 %{
 #include <errno.h>
@@ -51,6 +53,18 @@ static int str(yyscan_t scanner, int token)
 	return token;
 }
 
+#define REWIND(__alloc)				\
+do {								\
+	YYSTYPE *__yylval = parse_events_get_lval(yyscanner);	\
+	char *text = parse_events_get_text(yyscanner);		\
+								\
+	if (__alloc)						\
+		__yylval->str = strdup(text);			\
+								\
+	yycolumn -= strlen(text);				\
+	yyless(0);						\
+} while (0)
+
 static int pmu_str_check(yyscan_t scanner)
 {
 	YYSTYPE *yylval = parse_events_get_lval(scanner);
@@ -85,6 +99,13 @@ static int term(yyscan_t scanner, int type)
 	return PE_TERM;
 }
 
+#define YY_USER_ACTION					\
+do {							\
+	yylloc->last_column  = yylloc->first_column;	\
+	yylloc->first_column = yycolumn;		\
+	yycolumn += yyleng;				\
+} while (0);
+
 %}
 
 %x mem
@@ -119,6 +140,12 @@ modifier_bp	[rwx]{1,3}
 
 		if (start_token) {
 			parse_events_set_extra(NULL, yyscanner);
+			/*
+			 * The flex parser does not init the locations variable
+			 * via the scan_string interface, so we need to do the
+			 * init here.
+			 */
+			yycolumn = 0;
 			return start_token;
 		}
          }
@@ -127,24 +154,30 @@ modifier_bp	[rwx]{1,3}
 <event>{
 
 {group}		{
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(0);
 		}
 
 {event_pmu}	|
 {event}		{
-			str(yyscanner, PE_EVENT_NAME);
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(1);
 			return PE_EVENT_NAME;
 		}
 
 .		|
 <<EOF>>		{
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(0);
 		}
 
 }
 
 <config>{
+	/*
+	 * Please update formats_error_string any time
+	 * a new static term is added.
+	 */
 config			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
 config1			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
 config2			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 72def077dbbf..3d11e00243e3 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -2,6 +2,7 @@
 %parse-param {void *_data}
 %parse-param {void *scanner}
 %lex-param {void* scanner}
+%locations
 
 %{
 
@@ -14,8 +15,6 @@
 #include "parse-events.h"
 #include "parse-events-bison.h"
 
-extern int parse_events_lex (YYSTYPE* lvalp, void* scanner);
-
 #define ABORT_ON(val) \
 do { \
 	if (val) \
@@ -208,7 +207,7 @@ PE_NAME '/' event_config '/'
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, $3));
+	ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
 	parse_events__free_terms($3);
 	$$ = list;
 }
@@ -219,7 +218,7 @@ PE_NAME '/' '/'
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, NULL));
+	ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
 	$$ = list;
 }
 |
@@ -232,11 +231,11 @@ PE_KERNEL_PMU_EVENT sep_dc
 
 	ALLOC_LIST(head);
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, 1));
+					$1, 1, &@1, NULL));
 	list_add_tail(&term->list, head);
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head));
+	ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
 	parse_events__free_terms(head);
 	$$ = list;
 }
@@ -252,7 +251,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
 	ALLOC_LIST(head);
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					&pmu_name, 1));
+					&pmu_name, 1, &@1, NULL));
 	list_add_tail(&term->list, head);
 
 	ALLOC_LIST(list);
@@ -275,8 +274,7 @@ value_sym '/' event_config '/'
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  type, config, $3));
+	ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
 	parse_events__free_terms($3);
 	$$ = list;
 }
@@ -289,8 +287,7 @@ value_sym sep_slash_dc
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  type, config, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, type, config, NULL));
 	$$ = list;
 }
 
@@ -389,7 +386,13 @@ PE_NAME ':' PE_NAME
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_tracepoint(list, &data->idx, $1, $3));
+	if (parse_events_add_tracepoint(list, &data->idx, $1, $3)) {
+		struct parse_events_error *error = data->error;
+
+		error->idx = @1.first_column;
+		error->str = strdup("unknown tracepoint");
+		return -1;
+	}
 	$$ = list;
 }
 
@@ -400,7 +403,7 @@ PE_VALUE ':' PE_VALUE
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx, (u32)$1, $3, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
 	$$ = list;
 }
 
@@ -411,8 +414,7 @@ PE_RAW
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  PERF_TYPE_RAW, $1, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
 	$$ = list;
 }
 
@@ -450,7 +452,7 @@ PE_NAME '=' PE_NAME
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, $3));
+					$1, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -459,7 +461,7 @@ PE_NAME '=' PE_VALUE
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, $3));
+					$1, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -477,7 +479,7 @@ PE_NAME
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, 1));
+					$1, 1, &@1, NULL));
 	$$ = term;
 }
 |
@@ -494,7 +496,7 @@ PE_TERM '=' PE_NAME
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3));
+	ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -502,7 +504,7 @@ PE_TERM '=' PE_VALUE
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3));
+	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -510,7 +512,7 @@ PE_TERM
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1));
+	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
 	$$ = term;
 }
 
@@ -520,7 +522,9 @@ sep_slash_dc: '/' | ':' |
 
 %%
 
-void parse_events_error(void *data __maybe_unused, void *scanner __maybe_unused,
+void parse_events_error(YYLTYPE *loc, void *data,
+			void *scanner __maybe_unused,
 			char const *msg __maybe_unused)
 {
+	parse_events_evlist_error(data, loc->last_column, "parser error");
 }
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 59561fd86278..367d8b816cc7 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -123,6 +123,10 @@ struct option {
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
+#define OPT_STRING_OPTARG(s, l, v, a, h, d) \
+	{ .type = OPTION_STRING,  .short_name = (s), .long_name = (l), \
+	  .value = check_vtype(v, const char **), (a), .help = (h), \
+	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
 #define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 48411674da0f..244c66f89891 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -579,6 +579,38 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
 	return -1;
 }
 
+static char *formats_error_string(struct list_head *formats)
+{
+	struct perf_pmu_format *format;
+	char *err, *str;
+	static const char *static_terms = "config,config1,config2,name,period,branch_type\n";
+	unsigned i = 0;
+
+	if (!asprintf(&str, "valid terms:"))
+		return NULL;
+
+	/* sysfs exported terms */
+	list_for_each_entry(format, formats, list) {
+		char c = i++ ? ',' : ' ';
+
+		err = str;
+		if (!asprintf(&str, "%s%c%s", err, c, format->name))
+			goto fail;
+		free(err);
+	}
+
+	/* static terms */
+	err = str;
+	if (!asprintf(&str, "%s,%s", err, static_terms))
+		goto fail;
+
+	free(err);
+	return str;
+fail:
+	free(err);
+	return NULL;
+}
+
 /*
  * Setup one of config[12] attr members based on the
  * user input data - term parameter.
@@ -587,7 +619,7 @@ static int pmu_config_term(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct parse_events_term *term,
 			   struct list_head *head_terms,
-			   bool zero)
+			   bool zero, struct parse_events_error *err)
 {
 	struct perf_pmu_format *format;
 	__u64 *vp;
@@ -611,6 +643,11 @@ static int pmu_config_term(struct list_head *formats,
 	if (!format) {
 		if (verbose)
 			printf("Invalid event/parameter '%s'\n", term->config);
+		if (err) {
+			err->idx  = term->err_term;
+			err->str  = strdup("unknown term");
+			err->help = formats_error_string(formats);
+		}
 		return -EINVAL;
 	}
 
@@ -636,9 +673,14 @@ static int pmu_config_term(struct list_head *formats,
 		val = term->val.num;
 	else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
 		if (strcmp(term->val.str, "?")) {
-			if (verbose)
+			if (verbose) {
 				pr_info("Invalid sysfs entry %s=%s\n",
 						term->config, term->val.str);
+			}
+			if (err) {
+				err->idx = term->err_val;
+				err->str = strdup("expected numeric value");
+			}
 			return -EINVAL;
 		}
 
@@ -654,12 +696,13 @@ static int pmu_config_term(struct list_head *formats,
 int perf_pmu__config_terms(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct list_head *head_terms,
-			   bool zero)
+			   bool zero, struct parse_events_error *err)
 {
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, head_terms, list) {
-		if (pmu_config_term(formats, attr, term, head_terms, zero))
+		if (pmu_config_term(formats, attr, term, head_terms,
+				    zero, err))
 			return -EINVAL;
 	}
 
@@ -672,12 +715,14 @@ int perf_pmu__config_terms(struct list_head *formats,
  * 2) pmu format definitions - specified by pmu parameter
  */
 int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-		     struct list_head *head_terms)
+		     struct list_head *head_terms,
+		     struct parse_events_error *err)
 {
 	bool zero = !!pmu->default_config;
 
 	attr->type = pmu->type;
-	return perf_pmu__config_terms(&pmu->format, attr, head_terms, zero);
+	return perf_pmu__config_terms(&pmu->format, attr, head_terms,
+				      zero, err);
 }
 
 static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 6b1249fbdb5f..7b9c8cf8ae3e 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -4,6 +4,7 @@
 #include <linux/bitmap.h>
 #include <linux/perf_event.h>
 #include <stdbool.h>
+#include "parse-events.h"
 
 enum {
 	PERF_PMU_FORMAT_VALUE_CONFIG,
@@ -47,11 +48,12 @@ struct perf_pmu_alias {
 
 struct perf_pmu *perf_pmu__find(const char *name);
 int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-		     struct list_head *head_terms);
+		     struct list_head *head_terms,
+		     struct parse_events_error *error);
 int perf_pmu__config_terms(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct list_head *head_terms,
-			   bool zero);
+			   bool zero, struct parse_events_error *error);
 int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 			  struct perf_pmu_info *info);
 struct list_head *perf_pmu__alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index d8bb616ff57c..abf5845a2acc 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1077,6 +1077,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	struct perf_probe_point *pp = &pev->point;
 	char *ptr, *tmp;
 	char c, nc = 0;
+	bool file_spec = false;
 	/*
 	 * <Syntax>
 	 * perf probe [EVENT=]SRC[:LN|;PTN]
@@ -1105,6 +1106,23 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		arg = tmp;
 	}
 
+	/*
+	 * Check whether arg is a function or a file name and copy it.
+	 *
+	 * We consider arg to be a file spec if and only if it satisfies
+	 * all of the criteria below:
+	 * - it does not include any of "+@%",
+	 * - it includes one of ":;", and
+	 * - it has a period '.' in the name.
+	 *
+	 * Otherwise, we consider arg to be a function specification.
+	 */
+	if (!strpbrk(arg, "+@%") && (ptr = strpbrk(arg, ";:")) != NULL) {
+		/* This is a file spec if it includes a '.' before ; or : */
+		if (memchr(arg, '.', ptr - arg))
+			file_spec = true;
+	}
+
 	ptr = strpbrk(arg, ";:+@%");
 	if (ptr) {
 		nc = *ptr;
@@ -1115,10 +1133,9 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	if (tmp == NULL)
 		return -ENOMEM;
 
-	/* Check arg is function or file and copy it */
-	if (strchr(tmp, '.'))	/* File */
+	if (file_spec)
 		pp->file = tmp;
-	else			/* Function */
+	else
 		pp->function = tmp;
 
 	/* Parse other options */
@@ -2129,7 +2146,23 @@ static int show_perf_probe_event(struct perf_probe_event *pev,
 	return ret;
 }
 
-static int __show_perf_probe_events(int fd, bool is_kprobe)
+static bool filter_probe_trace_event(struct probe_trace_event *tev,
+				     struct strfilter *filter)
+{
+	char tmp[128];
+
+	/* First, check the event name itself */
+	if (strfilter__compare(filter, tev->event))
+		return true;
+
+	/* Next, check the combination of name and group */
+	if (e_snprintf(tmp, 128, "%s:%s", tev->group, tev->event) < 0)
+		return false;
+	return strfilter__compare(filter, tmp);
+}
+
+static int __show_perf_probe_events(int fd, bool is_kprobe,
+				    struct strfilter *filter)
 {
 	int ret = 0;
 	struct probe_trace_event tev;
@@ -2147,12 +2180,15 @@ static int __show_perf_probe_events(int fd, bool is_kprobe)
 	strlist__for_each(ent, rawlist) {
 		ret = parse_probe_trace_command(ent->s, &tev);
 		if (ret >= 0) {
+			if (!filter_probe_trace_event(&tev, filter))
+				goto next;
 			ret = convert_to_perf_probe_event(&tev, &pev,
 								is_kprobe);
 			if (ret >= 0)
 				ret = show_perf_probe_event(&pev,
 							    tev.point.module);
 		}
+next:
 		clear_perf_probe_event(&pev);
 		clear_probe_trace_event(&tev);
 		if (ret < 0)
@@ -2164,7 +2200,7 @@ static int __show_perf_probe_events(int fd, bool is_kprobe)
 }
 
 /* List up current perf-probe events */
-int show_perf_probe_events(void)
+int show_perf_probe_events(struct strfilter *filter)
 {
 	int kp_fd, up_fd, ret;
 
@@ -2176,7 +2212,7 @@ int show_perf_probe_events(void)
 
 	kp_fd = open_kprobe_events(false);
 	if (kp_fd >= 0) {
-		ret = __show_perf_probe_events(kp_fd, true);
+		ret = __show_perf_probe_events(kp_fd, true, filter);
 		close(kp_fd);
 		if (ret < 0)
 			goto out;
@@ -2190,7 +2226,7 @@ int show_perf_probe_events(void)
 	}
 
 	if (up_fd >= 0) {
-		ret = __show_perf_probe_events(up_fd, false);
+		ret = __show_perf_probe_events(up_fd, false, filter);
 		close(up_fd);
 	}
 out:
@@ -2265,6 +2301,9 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
 {
 	int i, ret;
 
+	if (*base == '.')
+		base++;
+
 	/* Try no suffix */
 	ret = e_snprintf(buf, len, "%s", base);
 	if (ret < 0) {
@@ -2447,6 +2486,10 @@ static int find_probe_functions(struct map *map, char *name)
 #define strdup_or_goto(str, label)	\
 	({ char *__p = strdup(str); if (!__p) goto label; __p; })
 
+void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused,
+				struct probe_trace_event *tev __maybe_unused,
+				struct map *map __maybe_unused) { }
+
 /*
  * Find probe function addresses from map.
  * Return an error or the number of found probe_trace_event
@@ -2553,6 +2596,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 					strdup_or_goto(pev->args[i].type,
 							nomem_out);
 		}
+		arch__fix_tev_from_maps(pev, tev, map);
 	}
 
 out:
@@ -2567,6 +2611,8 @@ err_out:
 	goto out;
 }
 
+bool __weak arch__prefers_symtab(void) { return false; }
+
 static int convert_to_probe_trace_events(struct perf_probe_event *pev,
 					  struct probe_trace_event **tevs,
 					  int max_tevs, const char *target)
@@ -2582,6 +2628,12 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
 		}
 	}
 
+	if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) {
+		ret = find_probe_trace_events_from_map(pev, tevs, max_tevs, target);
+		if (ret > 0)
+			return ret; /* Found in symbol table */
+	}
+
 	/* Convert perf_probe_event with debuginfo */
 	ret = try_to_find_probe_trace_events(pev, tevs, max_tevs, target);
 	if (ret != 0)
@@ -2682,40 +2734,39 @@ error:
 	return ret;
 }
 
-static int del_trace_probe_event(int fd, const char *buf,
-						  struct strlist *namelist)
+static int del_trace_probe_events(int fd, struct strfilter *filter,
+				  struct strlist *namelist)
 {
-	struct str_node *ent, *n;
-	int ret = -1;
+	struct str_node *ent;
+	const char *p;
+	int ret = -ENOENT;
 
-	if (strpbrk(buf, "*?")) { /* Glob-exp */
-		strlist__for_each_safe(ent, n, namelist)
-			if (strglobmatch(ent->s, buf)) {
-				ret = __del_trace_probe_event(fd, ent);
-				if (ret < 0)
-					break;
-				strlist__remove(namelist, ent);
-			}
-	} else {
-		ent = strlist__find(namelist, buf);
-		if (ent) {
+	if (!namelist)
+		return -ENOENT;
+
+	strlist__for_each(ent, namelist) {
+		p = strchr(ent->s, ':');
+		if ((p && strfilter__compare(filter, p + 1)) ||
+		    strfilter__compare(filter, ent->s)) {
 			ret = __del_trace_probe_event(fd, ent);
-			if (ret >= 0)
-				strlist__remove(namelist, ent);
+			if (ret < 0)
+				break;
 		}
 	}
 
 	return ret;
 }
 
-int del_perf_probe_events(struct strlist *dellist)
+int del_perf_probe_events(struct strfilter *filter)
 {
-	int ret = -1, ufd = -1, kfd = -1;
-	char buf[128];
-	const char *group, *event;
-	char *p, *str;
-	struct str_node *ent;
+	int ret, ret2, ufd = -1, kfd = -1;
 	struct strlist *namelist = NULL, *unamelist = NULL;
+	char *str = strfilter__string(filter);
+
+	if (!str)
+		return -EINVAL;
+
+	pr_debug("Delete filter: \'%s\'\n", str);
 
 	/* Get current event names */
 	kfd = open_kprobe_events(true);
@@ -2728,48 +2779,21 @@ int del_perf_probe_events(struct strlist *dellist)
 
 	if (kfd < 0 && ufd < 0) {
 		print_both_open_warning(kfd, ufd);
+		ret = kfd;
 		goto error;
 	}
 
-	if (namelist == NULL && unamelist == NULL)
+	ret = del_trace_probe_events(kfd, filter, namelist);
+	if (ret < 0 && ret != -ENOENT)
 		goto error;
 
-	strlist__for_each(ent, dellist) {
-		str = strdup(ent->s);
-		if (str == NULL) {
-			ret = -ENOMEM;
-			goto error;
-		}
-		pr_debug("Parsing: %s\n", str);
-		p = strchr(str, ':');
-		if (p) {
-			group = str;
-			*p = '\0';
-			event = p + 1;
-		} else {
-			group = "*";
-			event = str;
-		}
-
-		ret = e_snprintf(buf, 128, "%s:%s", group, event);
-		if (ret < 0) {
-			pr_err("Failed to copy event.");
-			free(str);
-			goto error;
-		}
-
-		pr_debug("Group: %s, Event: %s\n", group, event);
-
-		if (namelist)
-			ret = del_trace_probe_event(kfd, buf, namelist);
-
-		if (unamelist && ret != 0)
-			ret = del_trace_probe_event(ufd, buf, unamelist);
-
-		if (ret != 0)
-			pr_info("Info: Event \"%s\" does not exist.\n", buf);
-
-		free(str);
+	ret2 = del_trace_probe_events(ufd, filter, unamelist);
+	if (ret2 < 0 && ret2 != -ENOENT)
+		ret = ret2;
+	else if (ret == -ENOENT && ret2 == -ENOENT) {
+		pr_debug("\"%s\" does not hit any event.\n", str);
+		/* Note that this is silently ignored */
+		ret = 0;
 	}
 
 error:
@@ -2782,6 +2806,7 @@ error:
 		strlist__delete(unamelist);
 		close(ufd);
 	}
+	free(str);
 
 	return ret;
 }
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index d6b783447be9..e10aedc34570 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -126,8 +126,8 @@ extern const char *kernel_get_module_path(const char *module);
 
 extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 				 int max_probe_points, bool force_add);
-extern int del_perf_probe_events(struct strlist *dellist);
-extern int show_perf_probe_events(void);
+extern int del_perf_probe_events(struct strfilter *filter);
+extern int show_perf_probe_events(struct strfilter *filter);
 extern int show_line_range(struct line_range *lr, const char *module,
 			   bool user);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
@@ -135,6 +135,9 @@ extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			       struct strfilter *filter, bool externs);
 extern int show_available_funcs(const char *module, struct strfilter *filter,
 				bool user);
+bool arch__prefers_symtab(void);
+void arch__fix_tev_from_maps(struct perf_probe_event *pev,
+			     struct probe_trace_event *tev, struct map *map);
 
 /* Maximum index number of event-name postfix */
 #define MAX_EVENT_INDEX	1024
diff --git a/tools/perf/util/pstack.c b/tools/perf/util/pstack.c
index a126e6cc6e73..b234a6e3d0d4 100644
--- a/tools/perf/util/pstack.c
+++ b/tools/perf/util/pstack.c
@@ -74,3 +74,10 @@ void *pstack__pop(struct pstack *pstack)
 	pstack->entries[pstack->top] = NULL;
 	return ret;
 }
+
+void *pstack__peek(struct pstack *pstack)
+{
+	if (pstack->top == 0)
+		return NULL;
+	return pstack->entries[pstack->top - 1];
+}
diff --git a/tools/perf/util/pstack.h b/tools/perf/util/pstack.h
index c3cb6584d527..ded7f2e36624 100644
--- a/tools/perf/util/pstack.h
+++ b/tools/perf/util/pstack.h
@@ -10,5 +10,6 @@ bool pstack__empty(const struct pstack *pstack);
 void pstack__remove(struct pstack *pstack, void *key);
 void pstack__push(struct pstack *pstack, void *key);
 void *pstack__pop(struct pstack *pstack);
+void *pstack__peek(struct pstack *pstack);
 
 #endif /* _PERF_PSTACK_ */
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 8acd0df88b5c..d457c523a33d 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -20,7 +20,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
 	if (!evlist)
 		return -ENOMEM;
 
-	if (parse_events(evlist, str))
+	if (parse_events(evlist, str, NULL))
 		goto out_delete;
 
 	evsel = perf_evlist__first(evlist);
@@ -119,7 +119,16 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts)
 			evsel->attr.comm_exec = 1;
 	}
 
-	if (evlist->nr_entries > 1) {
+	if (opts->full_auxtrace) {
+		/*
+		 * Need to be able to synthesize and parse selected events with
+		 * arbitrary sample types, which requires always being able to
+		 * match the id.
+		 */
+		use_sample_identifier = perf_can_sample_identifier();
+		evlist__for_each(evlist, evsel)
+			perf_evsel__set_sample_id(evsel, use_sample_identifier);
+	} else if (evlist->nr_entries > 1) {
 		struct perf_evsel *first = perf_evlist__first(evlist);
 
 		evlist__for_each(evlist, evsel) {
@@ -207,7 +216,7 @@ bool perf_evlist__can_select_event(struct perf_evlist *evlist, const char *str)
 	if (!temp_evlist)
 		return false;
 
-	err = parse_events(temp_evlist, str);
+	err = parse_events(temp_evlist, str, NULL);
 	if (err)
 		goto out_delete;
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0c74012575ac..e722107f932a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -15,12 +15,13 @@
 #include "cpumap.h"
 #include "perf_regs.h"
 #include "asm/bug.h"
+#include "auxtrace.h"
 
-static int machines__deliver_event(struct machines *machines,
-				   struct perf_evlist *evlist,
-				   union perf_event *event,
-				   struct perf_sample *sample,
-				   struct perf_tool *tool, u64 file_offset);
+static int perf_session__deliver_event(struct perf_session *session,
+				       union perf_event *event,
+				       struct perf_sample *sample,
+				       struct perf_tool *tool,
+				       u64 file_offset);
 
 static int perf_session__open(struct perf_session *session)
 {
@@ -105,8 +106,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe,
 		return ret;
 	}
 
-	return machines__deliver_event(&session->machines, session->evlist, event->event,
-				       &sample, session->tool, event->file_offset);
+	return perf_session__deliver_event(session, event->event, &sample,
+					   session->tool, event->file_offset);
 }
 
 struct perf_session *perf_session__new(struct perf_data_file *file,
@@ -119,6 +120,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
 
 	session->repipe = repipe;
 	session->tool   = tool;
+	INIT_LIST_HEAD(&session->auxtrace_index);
 	machines__init(&session->machines);
 	ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
 
@@ -185,6 +187,8 @@ static void perf_session_env__delete(struct perf_session_env *env)
 
 void perf_session__delete(struct perf_session *session)
 {
+	auxtrace__free(session);
+	auxtrace_index__free(&session->auxtrace_index);
 	perf_session__destroy_kernel_maps(session);
 	perf_session__delete_threads(session);
 	perf_session_env__delete(&session->header.env);
@@ -262,6 +266,49 @@ static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
+static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
+				union perf_event *event __maybe_unused,
+				struct perf_session *session __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
+static int skipn(int fd, off_t n)
+{
+	char buf[4096];
+	ssize_t ret;
+
+	while (n > 0) {
+		ret = read(fd, buf, min(n, (off_t)sizeof(buf)));
+		if (ret <= 0)
+			return ret;
+		n -= ret;
+	}
+
+	return 0;
+}
+
+static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
+				       union perf_event *event,
+				       struct perf_session *session
+				       __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	if (perf_data_file__is_pipe(session->file))
+		skipn(perf_data_file__fd(session->file), event->auxtrace.size);
+	return event->auxtrace.size;
+}
+
+static
+int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
+				      union perf_event *event __maybe_unused,
+				      struct perf_session *session __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
 void perf_tool__fill_defaults(struct perf_tool *tool)
 {
 	if (tool->sample == NULL)
@@ -278,6 +325,10 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->exit = process_event_stub;
 	if (tool->lost == NULL)
 		tool->lost = perf_event__process_lost;
+	if (tool->aux == NULL)
+		tool->aux = perf_event__process_aux;
+	if (tool->itrace_start == NULL)
+		tool->itrace_start = perf_event__process_itrace_start;
 	if (tool->read == NULL)
 		tool->read = process_event_sample_stub;
 	if (tool->throttle == NULL)
@@ -298,6 +349,12 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 	}
 	if (tool->id_index == NULL)
 		tool->id_index = process_id_index_stub;
+	if (tool->auxtrace_info == NULL)
+		tool->auxtrace_info = process_event_auxtrace_info_stub;
+	if (tool->auxtrace == NULL)
+		tool->auxtrace = process_event_auxtrace_stub;
+	if (tool->auxtrace_error == NULL)
+		tool->auxtrace_error = process_event_auxtrace_error_stub;
 }
 
 static void swap_sample_id_all(union perf_event *event, void *data)
@@ -390,6 +447,26 @@ static void perf_event__read_swap(union perf_event *event, bool sample_id_all)
 		swap_sample_id_all(event, &event->read + 1);
 }
 
+static void perf_event__aux_swap(union perf_event *event, bool sample_id_all)
+{
+	event->aux.aux_offset = bswap_64(event->aux.aux_offset);
+	event->aux.aux_size   = bswap_64(event->aux.aux_size);
+	event->aux.flags      = bswap_64(event->aux.flags);
+
+	if (sample_id_all)
+		swap_sample_id_all(event, &event->aux + 1);
+}
+
+static void perf_event__itrace_start_swap(union perf_event *event,
+					  bool sample_id_all)
+{
+	event->itrace_start.pid	 = bswap_32(event->itrace_start.pid);
+	event->itrace_start.tid	 = bswap_32(event->itrace_start.tid);
+
+	if (sample_id_all)
+		swap_sample_id_all(event, &event->itrace_start + 1);
+}
+
 static void perf_event__throttle_swap(union perf_event *event,
 				      bool sample_id_all)
 {
@@ -449,6 +526,7 @@ void perf_event__attr_swap(struct perf_event_attr *attr)
 	attr->branch_sample_type = bswap_64(attr->branch_sample_type);
 	attr->sample_regs_user	 = bswap_64(attr->sample_regs_user);
 	attr->sample_stack_user  = bswap_32(attr->sample_stack_user);
+	attr->aux_watermark	 = bswap_32(attr->aux_watermark);
 
 	swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64));
 }
@@ -478,6 +556,40 @@ static void perf_event__tracing_data_swap(union perf_event *event,
 	event->tracing_data.size = bswap_32(event->tracing_data.size);
 }
 
+static void perf_event__auxtrace_info_swap(union perf_event *event,
+					   bool sample_id_all __maybe_unused)
+{
+	size_t size;
+
+	event->auxtrace_info.type = bswap_32(event->auxtrace_info.type);
+
+	size = event->header.size;
+	size -= (void *)&event->auxtrace_info.priv - (void *)event;
+	mem_bswap_64(event->auxtrace_info.priv, size);
+}
+
+static void perf_event__auxtrace_swap(union perf_event *event,
+				      bool sample_id_all __maybe_unused)
+{
+	event->auxtrace.size      = bswap_64(event->auxtrace.size);
+	event->auxtrace.offset    = bswap_64(event->auxtrace.offset);
+	event->auxtrace.reference = bswap_64(event->auxtrace.reference);
+	event->auxtrace.idx       = bswap_32(event->auxtrace.idx);
+	event->auxtrace.tid       = bswap_32(event->auxtrace.tid);
+	event->auxtrace.cpu       = bswap_32(event->auxtrace.cpu);
+}
+
+static void perf_event__auxtrace_error_swap(union perf_event *event,
+					    bool sample_id_all __maybe_unused)
+{
+	event->auxtrace_error.type = bswap_32(event->auxtrace_error.type);
+	event->auxtrace_error.code = bswap_32(event->auxtrace_error.code);
+	event->auxtrace_error.cpu  = bswap_32(event->auxtrace_error.cpu);
+	event->auxtrace_error.pid  = bswap_32(event->auxtrace_error.pid);
+	event->auxtrace_error.tid  = bswap_32(event->auxtrace_error.tid);
+	event->auxtrace_error.ip   = bswap_64(event->auxtrace_error.ip);
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
 				    bool sample_id_all);
 
@@ -492,11 +604,16 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_THROTTLE]		  = perf_event__throttle_swap,
 	[PERF_RECORD_UNTHROTTLE]	  = perf_event__throttle_swap,
 	[PERF_RECORD_SAMPLE]		  = perf_event__all64_swap,
+	[PERF_RECORD_AUX]		  = perf_event__aux_swap,
+	[PERF_RECORD_ITRACE_START]	  = perf_event__itrace_start_swap,
 	[PERF_RECORD_HEADER_ATTR]	  = perf_event__hdr_attr_swap,
 	[PERF_RECORD_HEADER_EVENT_TYPE]	  = perf_event__event_type_swap,
 	[PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
 	[PERF_RECORD_HEADER_BUILD_ID]	  = NULL,
 	[PERF_RECORD_ID_INDEX]		  = perf_event__all64_swap,
+	[PERF_RECORD_AUXTRACE_INFO]	  = perf_event__auxtrace_info_swap,
+	[PERF_RECORD_AUXTRACE]		  = perf_event__auxtrace_swap,
+	[PERF_RECORD_AUXTRACE_ERROR]	  = perf_event__auxtrace_error_swap,
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
@@ -938,12 +1055,34 @@ static int machines__deliver_event(struct machines *machines,
 		return tool->throttle(tool, event, sample, machine);
 	case PERF_RECORD_UNTHROTTLE:
 		return tool->unthrottle(tool, event, sample, machine);
+	case PERF_RECORD_AUX:
+		return tool->aux(tool, event, sample, machine);
+	case PERF_RECORD_ITRACE_START:
+		return tool->itrace_start(tool, event, sample, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
 	}
 }
 
+static int perf_session__deliver_event(struct perf_session *session,
+				       union perf_event *event,
+				       struct perf_sample *sample,
+				       struct perf_tool *tool,
+				       u64 file_offset)
+{
+	int ret;
+
+	ret = auxtrace__process_event(session, event, sample, tool);
+	if (ret < 0)
+		return ret;
+	if (ret > 0)
+		return 0;
+
+	return machines__deliver_event(&session->machines, session->evlist,
+				       event, sample, tool, file_offset);
+}
+
 static s64 perf_session__process_user_event(struct perf_session *session,
 					    union perf_event *event,
 					    u64 file_offset)
@@ -980,6 +1119,15 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 		return tool->finished_round(tool, event, oe);
 	case PERF_RECORD_ID_INDEX:
 		return tool->id_index(tool, event, session);
+	case PERF_RECORD_AUXTRACE_INFO:
+		return tool->auxtrace_info(tool, event, session);
+	case PERF_RECORD_AUXTRACE:
+		/* setup for reading amidst mmap */
+		lseek(fd, file_offset + event->header.size, SEEK_SET);
+		return tool->auxtrace(tool, event, session);
+	case PERF_RECORD_AUXTRACE_ERROR:
+		perf_session__auxtrace_error_inc(session, event);
+		return tool->auxtrace_error(tool, event, session);
 	default:
 		return -EINVAL;
 	}
@@ -1096,8 +1244,8 @@ static s64 perf_session__process_event(struct perf_session *session,
 			return ret;
 	}
 
-	return machines__deliver_event(&session->machines, evlist, event,
-				       &sample, tool, file_offset);
+	return perf_session__deliver_event(session, event, &sample, tool,
+					   file_offset);
 }
 
 void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1168,6 +1316,8 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
 
 	if (oe->nr_unordered_events != 0)
 		ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
+
+	events_stats__auxtrace_error_warn(stats);
 }
 
 volatile int session_done;
@@ -1256,10 +1406,14 @@ more:
 done:
 	/* do the final flush for ordered samples */
 	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+	if (err)
+		goto out_err;
+	err = auxtrace__flush_events(session, tool);
 out_err:
 	free(buf);
 	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
+	auxtrace__free_events(session);
 	return err;
 }
 
@@ -1402,10 +1556,14 @@ more:
 out:
 	/* do the final flush for ordered samples */
 	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+	if (err)
+		goto out_err;
+	err = auxtrace__flush_events(session, tool);
 out_err:
 	ui_progress__finish();
 	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
+	auxtrace__free_events(session);
 	session->one_mmap = false;
 	return err;
 }
@@ -1488,7 +1646,13 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
 
 size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 {
-	size_t ret = fprintf(fp, "Aggregated stats:\n");
+	size_t ret;
+	const char *msg = "";
+
+	if (perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+		msg = " (excludes AUX area (e.g. instruction trace) decoded / synthesized events)";
+
+	ret = fprintf(fp, "Aggregated stats:%s\n", msg);
 
 	ret += events_stats__fprintf(&session->evlist->stats, fp);
 	return ret;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index d5fa7b7916ef..b44afc75d1cc 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -15,10 +15,16 @@
 struct ip_callchain;
 struct thread;
 
+struct auxtrace;
+struct itrace_synth_opts;
+
 struct perf_session {
 	struct perf_header	header;
 	struct machines		machines;
 	struct perf_evlist	*evlist;
+	struct auxtrace		*auxtrace;
+	struct itrace_synth_opts *itrace_synth_opts;
+	struct list_head	auxtrace_index;
 	struct trace_event	tevent;
 	bool			repipe;
 	bool			one_mmap;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 846036a921dc..e97cd476d336 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -58,15 +58,16 @@ struct he_stat {
 
 struct hist_entry_diff {
 	bool	computed;
+	union {
+		/* PERF_HPP__DELTA */
+		double	period_ratio_delta;
 
-	/* PERF_HPP__DELTA */
-	double	period_ratio_delta;
-
-	/* PERF_HPP__RATIO */
-	double	period_ratio;
+		/* PERF_HPP__RATIO */
+		double	period_ratio;
 
-	/* HISTC_WEIGHTED_DIFF */
-	s64	wdiff;
+		/* HISTC_WEIGHTED_DIFF */
+		s64	wdiff;
+	};
 };
 
 /**
@@ -92,21 +93,28 @@ struct hist_entry {
 	s32			cpu;
 	u8			cpumode;
 
-	struct hist_entry_diff	diff;
-
 	/* We are added by hists__add_dummy_entry. */
 	bool			dummy;
 
-	/* XXX These two should move to some tree widget lib */
-	u16			row_offset;
-	u16			nr_rows;
-
-	bool			init_have_children;
 	char			level;
 	u8			filtered;
+	union {
+		/*
+		 * Since perf diff only supports the stdio output, TUI
+		 * fields are only accessed from perf report (or perf
+		 * top).  So make it a union to reduce memory usage.
+		 */
+		struct hist_entry_diff	diff;
+		struct /* for TUI */ {
+			u16	row_offset;
+			u16	nr_rows;
+			bool	init_have_children;
+			bool	unfolded;
+			bool	has_children;
+		};
+	};
 	char			*srcline;
 	struct symbol		*parent;
-	unsigned long		position;
 	struct rb_root		sorted_chain;
 	struct branch_info	*branch_info;
 	struct hists		*hists;
diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c
index 79a757a2a15c..bcae659b6546 100644
--- a/tools/perf/util/strfilter.c
+++ b/tools/perf/util/strfilter.c
@@ -170,6 +170,46 @@ struct strfilter *strfilter__new(const char *rules, const char **err)
 	return filter;
 }
 
+static int strfilter__append(struct strfilter *filter, bool _or,
+			     const char *rules, const char **err)
+{
+	struct strfilter_node *right, *root;
+	const char *ep = NULL;
+
+	if (!filter || !rules)
+		return -EINVAL;
+
+	right = strfilter_node__new(rules, &ep);
+	if (!right || *ep != '\0') {
+		if (err)
+			*err = ep;
+		goto error;
+	}
+	root = strfilter_node__alloc(_or ? OP_or : OP_and, filter->root, right);
+	if (!root) {
+		ep = NULL;
+		goto error;
+	}
+
+	filter->root = root;
+	return 0;
+
+error:
+	strfilter_node__delete(right);
+	return ep ? -EINVAL : -ENOMEM;
+}
+
+int strfilter__or(struct strfilter *filter, const char *rules, const char **err)
+{
+	return strfilter__append(filter, true, rules, err);
+}
+
+int strfilter__and(struct strfilter *filter, const char *rules,
+		   const char **err)
+{
+	return strfilter__append(filter, false, rules, err);
+}
+
 static bool strfilter_node__compare(struct strfilter_node *node,
 				    const char *str)
 {
@@ -197,3 +237,70 @@ bool strfilter__compare(struct strfilter *filter, const char *str)
 		return false;
 	return strfilter_node__compare(filter->root, str);
 }
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf);
+
+/* sprint node, wrapped in parentheses when it has a right operand */
+static int strfilter_node__sprint_pt(struct strfilter_node *node, char *buf)
+{
+	int len;
+	int pt = node->r ? 2 : 0;	/* don't need to check node->l */
+
+	if (buf && pt)
+		*buf++ = '(';
+	len = strfilter_node__sprint(node, buf);
+	if (len < 0)
+		return len;
+	if (buf && pt)
+		*(buf + len) = ')';	/* NOTE: no NUL is written after ')' */
+	return len + pt;
+}
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf)
+{
+	int len = 0, rlen;
+
+	if (!node || !node->p)
+		return -EINVAL;
+	/* A NULL buf writes nothing; only the length is computed. */
+	switch (*node->p) {
+	case '|':
+	case '&':
+		len = strfilter_node__sprint_pt(node->l, buf);
+		if (len < 0)
+			return len;
+		/* fall through: emit the operator, then the right operand */
+	case '!':
+		if (buf) {
+			*(buf + len++) = *node->p;
+			buf += len;
+		} else
+			len++;
+		rlen = strfilter_node__sprint_pt(node->r, buf);
+		if (rlen < 0)
+			return rlen;
+		len += rlen;
+		break;
+	default:	/* not an operator: copy the pattern text as is */
+		len = strlen(node->p);
+		if (buf)
+			strcpy(buf, node->p);
+	}
+	return len;
+}
+
+char *strfilter__string(struct strfilter *filter)
+{
+	int len;
+	char *ret = NULL;
+	/* Dry run with a NULL buffer: only computes the needed length. */
+	len = strfilter_node__sprint(filter->root, NULL);
+	if (len < 0)
+		return NULL;
+	/* Zero-filled: a trailing ')' is emitted without a NUL after it. */
+	ret = calloc(1, len + 1);
+	if (ret)
+		strfilter_node__sprint(filter->root, ret);
+
+	return ret;
+}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
index fe611f3c9e39..cff5eda88728 100644
--- a/tools/perf/util/strfilter.h
+++ b/tools/perf/util/strfilter.h
@@ -29,6 +29,32 @@ struct strfilter {
 struct strfilter *strfilter__new(const char *rules, const char **err);
 
 /**
+ * strfilter__or - Append an additional rule by logical-or
+ * @filter: Original string filter
+ * @rules: Filter rule to be parsed and joined to the current root of
+ *         @filter by using logical-or.
+ * @err: Set, on a parse error, to the point in @rules where parsing stopped
+ *
+ * Parse @rules and join it to the @filter by using logical-or.
+ * Return 0 if success, or return the error code.
+ */
+int strfilter__or(struct strfilter *filter,
+		  const char *rules, const char **err);
+
+/**
+ * strfilter__and - Append an additional rule by logical-and
+ * @filter: Original string filter
+ * @rules: Filter rule to be parsed and joined to the current root of
+ *         @filter by using logical-and.
+ * @err: Set, on a parse error, to the point in @rules where parsing stopped
+ *
+ * Parse @rules and join it to the @filter by using logical-and.
+ * Return 0 if success, or return the error code.
+ */
+int strfilter__and(struct strfilter *filter,
+		   const char *rules, const char **err);
+
+/**
  * strfilter__compare - compare given string and a string filter
  * @filter: String filter
  * @str: target string
@@ -45,4 +71,13 @@ bool strfilter__compare(struct strfilter *filter, const char *str);
  */
 void strfilter__delete(struct strfilter *filter);
 
+/**
+ * strfilter__string - Reconstruct a rule string from filter
+ * @filter: String filter to reconstruct
+ *
+ * Reconstruct a rule string from @filter. This is useful for
+ * debug messages. Note that the returned string must be freed by the caller.
+ */
+char *strfilter__string(struct strfilter *filter);
+
 #endif
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index a7ab6063e038..9d526a5312b1 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -630,6 +630,11 @@ void symsrc__destroy(struct symsrc *ss)
 	close(ss->fd);
 }
 
+bool __weak elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_EXEC || ehdr.e_type == ET_REL;
+}
+
 int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		 enum dso_binary_type type)
 {
@@ -678,6 +683,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		}
 
 		if (!dso__build_id_equal(dso, build_id)) {
+			pr_debug("%s: build id mismatch for %s.\n", __func__, name);
 			dso->load_errno = DSO_LOAD_ERRNO__MISMATCHING_BUILDID;
 			goto out_elf_end;
 		}
@@ -711,8 +717,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 						     ".gnu.prelink_undo",
 						     NULL) != NULL);
 	} else {
-		ss->adjust_symbols = ehdr.e_type == ET_EXEC ||
-				     ehdr.e_type == ET_REL;
+		ss->adjust_symbols = elf__needs_adjust_symbols(ehdr);
 	}
 
 	ss->name   = strdup(name);
@@ -771,6 +776,8 @@ static bool want_demangle(bool is_kernel_sym)
 	return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
 }
 
+void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { }
+
 int dso__load_sym(struct dso *dso, struct map *map,
 		  struct symsrc *syms_ss, struct symsrc *runtime_ss,
 		  symbol_filter_t filter, int kmodule)
@@ -935,6 +942,8 @@ int dso__load_sym(struct dso *dso, struct map *map,
 		    (sym.st_value & 1))
 			--sym.st_value;
 
+		arch__elf_sym_adjust(&sym);
+
 		if (dso->kernel || kmodule) {
 			char dso_name[PATH_MAX];
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 201f6c4ca738..45ba48a7acb3 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -85,8 +85,17 @@ static int prefix_underscores_count(const char *str)
 	return tail - str;
 }
 
-#define SYMBOL_A 0
-#define SYMBOL_B 1
+int __weak arch__choose_best_symbol(struct symbol *syma,
+				    struct symbol *symb __maybe_unused)
+{
+	/* Avoid "SyS" kernel syscall aliases */
+	if (strlen(syma->name) >= 3 && !strncmp(syma->name, "SyS", 3))
+		return SYMBOL_B;
+	if (strlen(syma->name) >= 10 && !strncmp(syma->name, "compat_SyS", 10))
+		return SYMBOL_B;
+
+	return SYMBOL_A;
+}
 
 static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
 {
@@ -134,13 +143,7 @@ static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
 	else if (na < nb)
 		return SYMBOL_B;
 
-	/* Avoid "SyS" kernel syscall aliases */
-	if (na >= 3 && !strncmp(syma->name, "SyS", 3))
-		return SYMBOL_B;
-	if (na >= 10 && !strncmp(syma->name, "compat_SyS", 10))
-		return SYMBOL_B;
-
-	return SYMBOL_A;
+	return arch__choose_best_symbol(syma, symb);
 }
 
 void symbols__fixup_duplicate(struct rb_root *symbols)
@@ -408,7 +411,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
 		int cmp;
 
 		s = rb_entry(n, struct symbol_name_rb_node, rb_node);
-		cmp = strcmp(name, s->sym.name);
+		cmp = arch__compare_symbol_names(name, s->sym.name);
 
 		if (cmp < 0)
 			n = n->rb_left;
@@ -426,7 +429,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
 		struct symbol_name_rb_node *tmp;
 
 		tmp = rb_entry(n, struct symbol_name_rb_node, rb_node);
-		if (strcmp(tmp->sym.name, s->sym.name))
+		if (arch__compare_symbol_names(tmp->sym.name, s->sym.name))
 			break;
 
 		s = tmp;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 09561500164a..bef47ead1d9b 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -158,8 +158,6 @@ struct ref_reloc_sym {
 struct map_symbol {
 	struct map    *map;
 	struct symbol *sym;
-	bool	      unfolded;
-	bool	      has_children;
 };
 
 struct addr_map_symbol {
@@ -303,4 +301,14 @@ int setup_list(struct strlist **list, const char *list_str,
 int setup_intlist(struct intlist **list, const char *list_str,
 		  const char *list_name);
 
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
+void arch__elf_sym_adjust(GElf_Sym *sym);
+#endif
+
+#define SYMBOL_A 0
+#define SYMBOL_B 1
+
+int arch__choose_best_symbol(struct symbol *syma, struct symbol *symb);
+
 #endif /* __PERF_SYMBOL */
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 51d9e56c0f84..7f282ad1d2bd 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -3,6 +3,8 @@
 
 #include <stdbool.h>
 
+#include <linux/types.h>
+
 struct perf_session;
 union perf_event;
 struct perf_evlist;
@@ -29,6 +31,9 @@ typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
 typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event,
 			struct ordered_events *oe);
 
+typedef s64 (*event_op3)(struct perf_tool *tool, union perf_event *event,
+			 struct perf_session *session);
+
 struct perf_tool {
 	event_sample	sample,
 			read;
@@ -38,13 +43,18 @@ struct perf_tool {
 			fork,
 			exit,
 			lost,
+			aux,
+			itrace_start,
 			throttle,
 			unthrottle;
 	event_attr_op	attr;
 	event_op2	tracing_data;
 	event_oe	finished_round;
 	event_op2	build_id,
-			id_index;
+			id_index,
+			auxtrace_info,
+			auxtrace_error;
+	event_op3	auxtrace;
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 };