61 files changed, 1927 insertions, 1498 deletions
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index adc5a7e44b98..31824d5269cc 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -295,7 +295,10 @@ $(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
 $(OUTPUT)%.xml : %.txt
 	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
 	$(ASCIIDOC) -b docbook -d manpage \
-		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) \
+		-aperf_date=$(shell git log -1 --pretty="format:%cd" \
+				--date=short $<) \
+		-o $@+ $< && \
 	mv $@+ $@
 
 XSLT = docbook.xsl
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index 2cf2d9e9d0da..fd9241a1b987 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -1,991 +1 @@
-Intel Processor Trace
-=====================
-
-Overview
-========
-
-Intel Processor Trace (Intel PT) is an extension of Intel Architecture that
-collects information about software execution such as control flow, execution
-modes and timings and formats it into highly compressed binary packets.
-Technical details are documented in the Intel 64 and IA-32 Architectures
-Software Developer Manuals, Chapter 36 Intel Processor Trace.
-
-Intel PT is first supported in Intel Core M and 5th generation Intel Core
-processors that are based on the Intel micro-architecture code name Broadwell.
-
-Trace data is collected by 'perf record' and stored within the perf.data file.
-See below for options to 'perf record'.
-
-Trace data must be 'decoded' which involves walking the object code and matching
-the trace data packets. For example a TNT packet only tells whether a
-conditional branch was taken or not taken, so to make use of that packet the
-decoder must know precisely which instruction was being executed.
-
-Decoding is done on-the-fly.  The decoder outputs samples in the same format as
-samples output by perf hardware events, for example as though the "instructions"
-or "branches" events had been recorded.  Presently 3 tools support this:
-'perf script', 'perf report' and 'perf inject'.  See below for more information
-on using those tools.
-
-The main distinguishing feature of Intel PT is that the decoder can determine
-the exact flow of software execution.  Intel PT can be used to understand why
-and how did software get to a certain point, or behave a certain way.  The
-software does not have to be recompiled, so Intel PT works with debug or release
-builds, however the executed images are needed - which makes use in JIT-compiled
-environments, or with self-modified code, a challenge.  Also symbols need to be
-provided to make sense of addresses.
-
-A limitation of Intel PT is that it produces huge amounts of trace data
-(hundreds of megabytes per second per core) which takes a long time to decode,
-for example two or three orders of magnitude longer than it took to collect.
-Another limitation is the performance impact of tracing, something that will
-vary depending on the use-case and architecture.
-
-
-Quickstart
-==========
-
-It is important to start small.  That is because it is easy to capture vastly
-more data than can possibly be processed.
-
-The simplest thing to do with Intel PT is userspace profiling of small programs.
-Data is captured with 'perf record' e.g. to trace 'ls' userspace-only:
-
-	perf record -e intel_pt//u ls
-
-And profiled with 'perf report' e.g.
-
-	perf report
-
-To also trace kernel space presents a problem, namely kernel self-modifying
-code.  A fairly good kernel image is available in /proc/kcore but to get an
-accurate image a copy of /proc/kcore needs to be made under the same conditions
-as the data capture.  A script perf-with-kcore can do that, but beware that the
-script makes use of 'sudo' to copy /proc/kcore.  If you have perf installed
-locally from the source tree you can do:
-
-	~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls
-
-which will create a directory named 'pt_ls' and put the perf.data file and
-copies of /proc/kcore, /proc/kallsyms and /proc/modules into it.  Then to use
-'perf report' becomes:
-
-	~/libexec/perf-core/perf-with-kcore report pt_ls
-
-Because samples are synthesized after-the-fact, the sampling period can be
-selected for reporting. e.g. sample every microsecond
-
-	~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge
-
-See the sections below for more information about the --itrace option.
-
-Beware the smaller the period, the more samples that are produced, and the
-longer it takes to process them.
-
-Also note that the coarseness of Intel PT timing information will start to
-distort the statistical value of the sampling as the sampling period becomes
-smaller.
-
-To represent software control flow, "branches" samples are produced.  By default
-a branch sample is synthesized for every single branch.  To get an idea what
-data is available you can use the 'perf script' tool with all itrace sampling
-options, which will list all the samples.
-
-	perf record -e intel_pt//u ls
-	perf script --itrace=ibxwpe
-
-An interesting field that is not printed by default is 'flags' which can be
-displayed as follows:
-
-	perf script --itrace=ibxwpe -F+flags
-
-The flags are "bcrosyiABEx" which stand for branch, call, return, conditional,
-system, asynchronous, interrupt, transaction abort, trace begin, trace end, and
-in transaction, respectively.
-
-Another interesting field that is not printed by default is 'ipc' which can be
-displayed as follows:
-
-	perf script --itrace=be -F+ipc
-
-There are two ways that instructions-per-cycle (IPC) can be calculated depending
-on the recording.
-
-If the 'cyc' config term (see config terms section below) was used, then IPC is
-calculated using the cycle count from CYC packets, otherwise MTC packets are
-used - refer to the 'mtc' config term.  When MTC is used, however, the values
-are less accurate because the timing is less accurate.
-
-Because Intel PT does not update the cycle count on every branch or instruction,
-the values will often be zero.  When there are values, they will be the number
-of instructions and number of cycles since the last update, and thus represent
-the average IPC since the last IPC for that event type.  Note IPC for "branches"
-events is calculated separately from IPC for "instructions" events.
-
-Also note that the IPC instruction count may or may not include the current
-instruction.  If the cycle count is associated with an asynchronous branch
-(e.g. page fault or interrupt), then the instruction count does not include the
-current instruction, otherwise it does.  That is consistent with whether or not
-that instruction has retired when the cycle count is updated.
-
-Another note, in the case of "branches" events, non-taken branches are not
-presently sampled, so IPC values for them do not appear e.g. a CYC packet with a
-TNT packet that starts with a non-taken branch.  To see every possible IPC
-value, "instructions" events can be used e.g. --itrace=i0ns
-
-While it is possible to create scripts to analyze the data, an alternative
-approach is available to export the data to a sqlite or postgresql database.
-Refer to script export-to-sqlite.py or export-to-postgresql.py for more details,
-and to script exported-sql-viewer.py for an example of using the database.
-
-There is also script intel-pt-events.py which provides an example of how to
-unpack the raw data for power events and PTWRITE.
-
-As mentioned above, it is easy to capture too much data.  One way to limit the
-data captured is to use 'snapshot' mode which is explained further below.
-Refer to 'new snapshot option' and 'Intel PT modes of operation' further below.
-
-Another problem that will be experienced is decoder errors.  They can be caused
-by inability to access the executed image, self-modified or JIT-ed code, or the
-inability to match side-band information (such as context switches and mmaps)
-which results in the decoder not knowing what code was executed.
-
-There is also the problem of perf not being able to copy the data fast enough,
-resulting in data lost because the buffer was full.  See 'Buffer handling' below
-for more details.
-
-
-perf record
-===========
-
-new event
----------
-
-The Intel PT kernel driver creates a new PMU for Intel PT.  PMU events are
-selected by providing the PMU name followed by the "config" separated by slashes.
-An enhancement has been made to allow default "config" e.g. the option
-
-	-e intel_pt//
-
-will use a default config value.  Currently that is the same as
-
-	-e intel_pt/tsc,noretcomp=0/
-
-which is the same as
-
-	-e intel_pt/tsc=1,noretcomp=0/
-
-Note there are now new config terms - see section 'config terms' further below.
-
-The config terms are listed in /sys/devices/intel_pt/format.  They are bit
-fields within the config member of the struct perf_event_attr which is
-passed to the kernel by the perf_event_open system call.  They correspond to bit
-fields in the IA32_RTIT_CTL MSR.  Here is a list of them and their definitions:
-
-	$ grep -H . /sys/bus/event_source/devices/intel_pt/format/*
-	/sys/bus/event_source/devices/intel_pt/format/cyc:config:1
-	/sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22
-	/sys/bus/event_source/devices/intel_pt/format/mtc:config:9
-	/sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17
-	/sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11
-	/sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27
-	/sys/bus/event_source/devices/intel_pt/format/tsc:config:10
-
-Note that the default config must be overridden for each term i.e.
-
-	-e intel_pt/noretcomp=0/
-
-is the same as:
-
-	-e intel_pt/tsc=1,noretcomp=0/
-
-So, to disable TSC packets use:
-
-	-e intel_pt/tsc=0/
-
-It is also possible to specify the config value explicitly:
-
-	-e intel_pt/config=0x400/
-
-Note that, as with all events, the event is suffixed with event modifiers:
-
-	u	userspace
-	k	kernel
-	h	hypervisor
-	G	guest
-	H	host
-	p	precise ip
-
-'h', 'G' and 'H' are for virtualization which is not supported by Intel PT.
-'p' is also not relevant to Intel PT.  So only options 'u' and 'k' are
-meaningful for Intel PT.
-
-perf_event_attr is displayed if the -vv option is used e.g.
-
-	------------------------------------------------------------
-	perf_event_attr:
-	type                             6
-	size                             112
-	config                           0x400
-	{ sample_period, sample_freq }   1
-	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
-	read_format                      ID
-	disabled                         1
-	inherit                          1
-	exclude_kernel                   1
-	exclude_hv                       1
-	enable_on_exec                   1
-	sample_id_all                    1
-	------------------------------------------------------------
-	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
-	------------------------------------------------------------
-
-
-config terms
-------------
-
-The June 2015 version of Intel 64 and IA-32 Architectures Software Developer
-Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features.
-Some of the features are reflect in new config terms.  All the config terms are
-described below.
-
-tsc		Always supported.  Produces TSC timestamp packets to provide
-		timing information.  In some cases it is possible to decode
-		without timing information, for example a per-thread context
-		that does not overlap executable memory maps.
-
-		The default config selects tsc (i.e. tsc=1).
-
-noretcomp	Always supported.  Disables "return compression" so a TIP packet
-		is produced when a function returns.  Causes more packets to be
-		produced but might make decoding more reliable.
-
-		The default config does not select noretcomp (i.e. noretcomp=0).
-
-psb_period	Allows the frequency of PSB packets to be specified.
-
-		The PSB packet is a synchronization packet that provides a
-		starting point for decoding or recovery from errors.
-
-		Support for psb_period is indicated by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
-
-		which contains "1" if the feature is supported and "0"
-		otherwise.
-
-		Valid values are given by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/psb_periods
-
-		which contains a hexadecimal value, the bits of which represent
-		valid values e.g. bit 2 set means value 2 is valid.
-
-		The psb_period value is converted to the approximate number of
-		trace bytes between PSB packets as:
-
-			2 ^ (value + 11)
-
-		e.g. value 3 means 16KiB bytes between PSBs
-
-		If an invalid value is entered, the error message
-		will give a list of valid values e.g.
-
-			$ perf record -e intel_pt/psb_period=15/u uname
-			Invalid psb_period for intel_pt. Valid values are: 0-5
-
-		If MTC packets are selected, the default config selects a value
-		of 3 (i.e. psb_period=3) or the nearest lower value that is
-		supported (0 is always supported).  Otherwise the default is 0.
-
-		If decoding is expected to be reliable and the buffer is large
-		then a large PSB period can be used.
-
-		Because a TSC packet is produced with PSB, the PSB period can
-		also affect the granularity to timing information in the absence
-		of MTC or CYC.
-
-mtc		Produces MTC timing packets.
-
-		MTC packets provide finer grain timestamp information than TSC
-		packets.  MTC packets record time using the hardware crystal
-		clock (CTC) which is related to TSC packets using a TMA packet.
-
-		Support for this feature is indicated by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/mtc
-
-		which contains "1" if the feature is supported and
-		"0" otherwise.
-
-		The frequency of MTC packets can also be specified - see
-		mtc_period below.
-
-mtc_period	Specifies how frequently MTC packets are produced - see mtc
-		above for how to determine if MTC packets are supported.
-
-		Valid values are given by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/mtc_periods
-
-		which contains a hexadecimal value, the bits of which represent
-		valid values e.g. bit 2 set means value 2 is valid.
-
-		The mtc_period value is converted to the MTC frequency as:
-
-			CTC-frequency / (2 ^ value)
-
-		e.g. value 3 means one eighth of CTC-frequency
-
-		Where CTC is the hardware crystal clock, the frequency of which
-		can be related to TSC via values provided in cpuid leaf 0x15.
-
-		If an invalid value is entered, the error message
-		will give a list of valid values e.g.
-
-			$ perf record -e intel_pt/mtc_period=15/u uname
-			Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9
-
-		The default value is 3 or the nearest lower value
-		that is supported (0 is always supported).
-
-cyc		Produces CYC timing packets.
-
-		CYC packets provide even finer grain timestamp information than
-		MTC and TSC packets.  A CYC packet contains the number of CPU
-		cycles since the last CYC packet. Unlike MTC and TSC packets,
-		CYC packets are only sent when another packet is also sent.
-
-		Support for this feature is indicated by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
-
-		which contains "1" if the feature is supported and
-		"0" otherwise.
-
-		The number of CYC packets produced can be reduced by specifying
-		a threshold - see cyc_thresh below.
-
-cyc_thresh	Specifies how frequently CYC packets are produced - see cyc
-		above for how to determine if CYC packets are supported.
-
-		Valid cyc_thresh values are given by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds
-
-		which contains a hexadecimal value, the bits of which represent
-		valid values e.g. bit 2 set means value 2 is valid.
-
-		The cyc_thresh value represents the minimum number of CPU cycles
-		that must have passed before a CYC packet can be sent.  The
-		number of CPU cycles is:
-
-			2 ^ (value - 1)
-
-		e.g. value 4 means 8 CPU cycles must pass before a CYC packet
-		can be sent.  Note a CYC packet is still only sent when another
-		packet is sent, not at, e.g. every 8 CPU cycles.
-
-		If an invalid value is entered, the error message
-		will give a list of valid values e.g.
-
-			$ perf record -e intel_pt/cyc,cyc_thresh=15/u uname
-			Invalid cyc_thresh for intel_pt. Valid values are: 0-12
-
-		CYC packets are not requested by default.
-
-pt		Specifies pass-through which enables the 'branch' config term.
-
-		The default config selects 'pt' if it is available, so a user will
-		never need to specify this term.
-
-branch		Enable branch tracing.  Branch tracing is enabled by default so to
-		disable branch tracing use 'branch=0'.
-
-		The default config selects 'branch' if it is available.
-
-ptw		Enable PTWRITE packets which are produced when a ptwrite instruction
-		is executed.
-
-		Support for this feature is indicated by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/ptwrite
-
-		which contains "1" if the feature is supported and
-		"0" otherwise.
-
-fup_on_ptw	Enable a FUP packet to follow the PTWRITE packet.  The FUP packet
-		provides the address of the ptwrite instruction.  In the absence of
-		fup_on_ptw, the decoder will use the address of the previous branch
-		if branch tracing is enabled, otherwise the address will be zero.
-		Note that fup_on_ptw will work even when branch tracing is disabled.
-
-pwr_evt		Enable power events.  The power events provide information about
-		changes to the CPU C-state.
-
-		Support for this feature is indicated by:
-
-			/sys/bus/event_source/devices/intel_pt/caps/power_event_trace
-
-		which contains "1" if the feature is supported and
-		"0" otherwise.
-
-
-AUX area sampling option
-------------------------
-
-To select Intel PT "sampling" the AUX area sampling option can be used:
-
-	--aux-sample
-
-Optionally it can be followed by the sample size in bytes e.g.
-
-	--aux-sample=8192
-
-In addition, the Intel PT event to sample must be defined e.g.
-
-	-e intel_pt//u
-
-Samples on other events will be created containing Intel PT data e.g. the
-following will create Intel PT samples on the branch-misses event, note the
-events must be grouped using {}:
-
-	perf record --aux-sample -e '{intel_pt//u,branch-misses:u}'
-
-An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to
-events.  In this case, the grouping is implied e.g.
-
-	perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u
-
-is the same as:
-
-	perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}'
-
-but allows for also using an address filter e.g.:
-
-	perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls
-
-It is important to select a sample size that is big enough to contain at least
-one PSB packet.  If not a warning will be displayed:
-
-	Intel PT sample size (%zu) may be too small for PSB period (%zu)
-
-The calculation used for that is: if sample_size <= psb_period + 256 display the
-warning.  When sampling is used, psb_period defaults to 0 (2KiB).
-
-The default sample size is 4KiB.
-
-The sample size is passed in aux_sample_size in struct perf_event_attr.  The
-sample size is limited by the maximum event size which is 64KiB.  It is
-difficult to know how big the event might be without the trace sample attached,
-but the tool validates that the sample size is not greater than 60KiB.
-
-
-new snapshot option
--------------------
-
-The difference between full trace and snapshot from the kernel's perspective is
-that in full trace we don't overwrite trace data that the user hasn't collected
-yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let
-the trace run and overwrite older data in the buffer so that whenever something
-interesting happens, we can stop it and grab a snapshot of what was going on
-around that interesting moment.
-
-To select snapshot mode a new option has been added:
-
-	-S
-
-Optionally it can be followed by the snapshot size e.g.
-
-	-S0x100000
-
-The default snapshot size is the auxtrace mmap size.  If neither auxtrace mmap size
-nor snapshot size is specified, then the default is 4MiB for privileged users
-(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
-If an unprivileged user does not specify mmap pages, the mmap pages will be
-reduced as described in the 'new auxtrace mmap size option' section below.
-
-The snapshot size is displayed if the option -vv is used e.g.
-
-	Intel PT snapshot size: %zu
-
-
-new auxtrace mmap size option
----------------------------
-
-Intel PT buffer size is specified by an addition to the -m option e.g.
-
-	-m,16
-
-selects a buffer size of 16 pages i.e. 64KiB.
-
-Note that the existing functionality of -m is unchanged.  The auxtrace mmap size
-is specified by the optional addition of a comma and the value.
-
-The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users
-(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
-If an unprivileged user does not specify mmap pages, the mmap pages will be
-reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the
-user is likely to get an error as they exceed their mlock limit (Max locked
-memory as shown in /proc/self/limits).  Note that perf does not count the first
-512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu
-against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus
-their mlock limit (which defaults to 64KiB but is not multiplied by the number
-of cpus).
-
-In full-trace mode, powers of two are allowed for buffer size, with a minimum
-size of 2 pages.  In snapshot mode or sampling mode, it is the same but the
-minimum size is 1 page.
-
-The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g.
-
-	mmap length 528384
-	auxtrace mmap length 4198400
-
-
-Intel PT modes of operation
----------------------------
-
-Intel PT can be used in 2 modes:
-	full-trace mode
-	sample mode
-	snapshot mode
-
-Full-trace mode traces continuously e.g.
-
-	perf record -e intel_pt//u uname
-
-Sample mode attaches a Intel PT sample to other events e.g.
-
-	perf record --aux-sample -e intel_pt//u -e branch-misses:u
-
-Snapshot mode captures the available data when a signal is sent e.g.
-
-	perf record -v -e intel_pt//u -S ./loopy 1000000000 &
-	[1] 11435
-	kill -USR2 11435
-	Recording AUX area tracing snapshot
-
-Note that the signal sent is SIGUSR2.
-Note that "Recording AUX area tracing snapshot" is displayed because the -v
-option is used.
-
-The 2 modes cannot be used together.
-
-
-Buffer handling
----------------
-
-There may be buffer limitations (i.e. single ToPa entry) which means that actual
-buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER).  In order to
-provide other sizes, and in particular an arbitrarily large size, multiple
-buffers are logically concatenated.  However an interrupt must be used to switch
-between buffers.  That has two potential problems:
-	a) the interrupt may not be handled in time so that the current buffer
-	becomes full and some trace data is lost.
-	b) the interrupts may slow the system and affect the performance
-	results.
-
-If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event
-which the tools report as an error.
-
-In full-trace mode, the driver waits for data to be copied out before allowing
-the (logical) buffer to wrap-around.  If data is not copied out quickly enough,
-again 'truncated' is set in the PERF_RECORD_AUX event.  If the driver has to
-wait, the intel_pt event gets disabled.  Because it is difficult to know when
-that happens, perf tools always re-enable the intel_pt event after copying out
-data.
-
-
-Intel PT and build ids
-----------------------
-
-By default "perf record" post-processes the event stream to find all build ids
-for executables for all addresses sampled.  Deliberately, Intel PT is not
-decoded for that purpose (it would take too long).  Instead the build ids for
-all executables encountered (due to mmap, comm or task events) are included
-in the perf.data file.
-
-To see buildids included in the perf.data file use the command:
-
-	perf buildid-list
-
-If the perf.data file contains Intel PT data, that is the same as:
-
-	perf buildid-list --with-hits
-
-
-Snapshot mode and event disabling
----------------------------------
-
-In order to make a snapshot, the intel_pt event is disabled using an IOCTL,
-namely PERF_EVENT_IOC_DISABLE.  However doing that can also disable the
-collection of side-band information.  In order to prevent that,  a dummy
-software event has been introduced that permits tracking events (like mmaps) to
-continue to be recorded while intel_pt is disabled.  That is important to ensure
-there is complete side-band information to allow the decoding of subsequent
-snapshots.
-
-A test has been created for that.  To find the test:
-
-	perf test list
-	...
-	23: Test using a dummy software event to keep tracking
-
-To run the test:
-
-	perf test 23
-	23: Test using a dummy software event to keep tracking     : Ok
-
-
-perf record modes (nothing new here)
-------------------------------------
-
-perf record essentially operates in one of three modes:
-	per thread
-	per cpu
-	workload only
-
-"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a
-workload).
-"per cpu" is selected by -C or -a.
-"workload only" mode is selected by not using the other options but providing a
-command to run (i.e. the workload).
-
-In per-thread mode an exact list of threads is traced.  There is no inheritance.
-Each thread has its own event buffer.
-
-In per-cpu mode all processes (or processes from the selected cgroup i.e. -G
-option, or processes selected with -p or -u) are traced.  Each cpu has its own
-buffer. Inheritance is allowed.
-
-In workload-only mode, the workload is traced but with per-cpu buffers.
-Inheritance is allowed.  Note that you can now trace a workload in per-thread
-mode by using the --per-thread option.
-
-
-Privileged vs non-privileged users
-----------------------------------
-
-Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users
-have memory limits imposed upon them.  That affects what buffer sizes they can
-have as outlined above.
-
-The v4.2 kernel introduced support for a context switch metadata event,
-PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes
-are scheduled out and in, just not by whom, which is left for the
-PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context,
-which in turn requires CAP_SYS_ADMIN.
-
-Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context
-switches") commit, that introduces these metadata events for further info.
-
-When working with kernels < v4.2, the following considerations must be taken,
-as the sched:sched_switch tracepoints will be used to receive such information:
-
-Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are
-not permitted to use tracepoints which means there is insufficient side-band
-information to decode Intel PT in per-cpu mode, and potentially workload-only
-mode too if the workload creates new processes.
-
-Note also, that to use tracepoints, read-access to debugfs is required.  So if
-debugfs is not mounted or the user does not have read-access, it will again not
-be possible to decode Intel PT in per-cpu mode.
-
-
-sched_switch tracepoint
------------------------
-
-The sched_switch tracepoint is used to provide side-band data for Intel PT
-decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't
-available.
-
-The sched_switch events are automatically added. e.g. the second event shown
-below:
-
-	$ perf record -vv -e intel_pt//u uname
-	------------------------------------------------------------
-	perf_event_attr:
-	type                             6
-	size                             112
-	config                           0x400
-	{ sample_period, sample_freq }   1
-	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
-	read_format                      ID
-	disabled                         1
-	inherit                          1
-	exclude_kernel                   1
-	exclude_hv                       1
-	enable_on_exec                   1
-	sample_id_all                    1
-	------------------------------------------------------------
-	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
-	------------------------------------------------------------
-	perf_event_attr:
-	type                             2
-	size                             112
-	config                           0x108
-	{ sample_period, sample_freq }   1
-	sample_type                      IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER
-	read_format                      ID
-	inherit                          1
-	sample_id_all                    1
-	exclude_guest                    1
-	------------------------------------------------------------
-	sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8
-	sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8
-	sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8
-	sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8
-	------------------------------------------------------------
-	perf_event_attr:
-	type                             1
-	size                             112
-	config                           0x9
-	{ sample_period, sample_freq }   1
-	sample_type                      IP|TID|TIME|IDENTIFIER
-	read_format                      ID
-	disabled                         1
-	inherit                          1
-	exclude_kernel                   1
-	exclude_hv                       1
-	mmap                             1
-	comm                             1
-	enable_on_exec                   1
-	task                             1
-	sample_id_all                    1
-	mmap2                            1
-	comm_exec                        1
-	------------------------------------------------------------
-	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
-	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
-	mmap size 528384B
-	AUX area mmap length 4194304
-	perf event ring buffer mmapped per cpu
-	Synthesizing auxtrace information
-	Linux
-	[ perf record: Woken up 1 times to write data ]
-	[ perf record: Captured and wrote 0.042 MB perf.data ]
-
-Note, the sched_switch event is only added if the user is permitted to use it
-and only in per-cpu mode.
-
-Note also, the sched_switch event is only added if TSC packets are requested.
-That is because, in the absence of timing information, the sched_switch events
-cannot be matched against the Intel PT trace.
-
-
-perf script
-===========
-
-By default, perf script will decode trace data found in the perf.data file.
-This can be further controlled by new option --itrace.
-
-
-New --itrace option
--------------------
-
-Having no option is the same as
-
-	--itrace
-
-which, in turn, is the same as
-
-	--itrace=cepwx
-
-The letters are:
-
-	i	synthesize "instructions" events
-	b	synthesize "branches" events
-	x	synthesize "transactions" events
-	w	synthesize "ptwrite" events
-	p	synthesize "power" events
-	c	synthesize branches events (calls only)
-	r	synthesize branches events (returns only)
-	e	synthesize tracing error events
-	d	create a debug log
-	g	synthesize a call chain (use with i or x)
-	l	synthesize last branch entries (use with i or x)
-	s	skip initial number of events
-
-"Instructions" events look like they were recorded by "perf record -e
-instructions".
-
-"Branches" events look like they were recorded by "perf record -e branches". "c"
-and "r" can be combined to get calls and returns.
-
-"Transactions" events correspond to the start or end of transactions. The
-'flags' field can be used in perf script to determine whether the event is a
-tranasaction start, commit or abort.
-
-Note that "instructions", "branches" and "transactions" events depend on code
-flow packets which can be disabled by using the config term "branch=0".  Refer
-to the config terms section above.
-
-"ptwrite" events record the payload of the ptwrite instruction and whether
-"fup_on_ptw" was used.  "ptwrite" events depend on PTWRITE packets which are
-recorded only if the "ptw" config term was used.  Refer to the config terms
-section above.  perf script "synth" field displays "ptwrite" information like
-this: "ip: 0 payload: 0x123456789abcdef0"  where "ip" is 1 if "fup_on_ptw" was
-used.
-
-"Power" events correspond to power event packets and CBR (core-to-bus ratio)
-packets.  While CBR packets are always recorded when tracing is enabled, power
-event packets are recorded only if the "pwr_evt" config term was used.  Refer to
-the config terms section above.  The power events record information about
-C-state changes, whereas CBR is indicative of CPU frequency.  perf script
-"event,synth" fields display information like this:
-	cbr:  cbr: 22 freq: 2189 MHz (200%)
-	mwait:  hints: 0x60 extensions: 0x1
-	pwre:  hw: 0 cstate: 2 sub-cstate: 0
-	exstop:  ip: 1
-	pwrx:  deepest cstate: 2 last cstate: 2 wake reason: 0x4
-Where:
-	"cbr" includes the frequency and the percentage of maximum non-turbo
-	"mwait" shows mwait hints and extensions
-	"pwre" shows C-state transitions (to a C-state deeper than C0) and
-	whether	initiated by hardware
-	"exstop" indicates execution stopped and whether the IP was recorded
-	exactly,
-	"pwrx" indicates return to C0
-For more details refer to the Intel 64 and IA-32 Architectures Software
-Developer Manuals.
-
-Error events show where the decoder lost the trace.  Error events
-are quite important.  Users must know if what they are seeing is a complete
-picture or not.
-
-The "d" option will cause the creation of a file "intel_pt.log" containing all
-decoded packets and instructions.  Note that this option slows down the decoder
-and that the resulting file may be very large.
-
-In addition, the period of the "instructions" event can be specified. e.g.
-
-	--itrace=i10us
-
-sets the period to 10us i.e. one  instruction sample is synthesized for each 10
-microseconds of trace.  Alternatives to "us" are "ms" (milliseconds),
-"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions).
-
-"ms", "us" and "ns" are converted to TSC ticks.
-
-The timing information included with Intel PT does not give the time of every
-instruction.  Consequently, for the purpose of sampling, the decoder estimates
-the time since the last timing packet based on 1 tick per instruction.  The time
-on the sample is *not* adjusted and reflects the last known value of TSC.
-
-For Intel PT, the default period is 100us.
-
-Setting it to a zero period means "as often as possible".
-
-In the case of Intel PT that is the same as a period of 1 and a unit of
-'instructions' (i.e. --itrace=i1i).
-
-Also the call chain size (default 16, max. 1024) for instructions or
-transactions events can be specified. e.g.
-
-	--itrace=ig32
-	--itrace=xg32
-
-Also the number of last branch entries (default 64, max. 1024) for instructions or
-transactions events can be specified. e.g.
-
-       --itrace=il10
-       --itrace=xl10
-
-Note that last branch entries are cleared for each sample, so there is no overlap
-from one sample to the next.
-
-To disable trace decoding entirely, use the option --no-itrace.
-
-It is also possible to skip events generated (instructions, branches, transactions)
-at the beginning. This is useful to ignore initialization code.
-
-	--itrace=i0nss1000000
-
-skips the first million instructions.
-
-dump option
------------
-
-perf script has an option (-D) to "dump" the events i.e. display the binary
-data.
-
-When -D is used, Intel PT packets are displayed.  The packet decoder does not
-pay attention to PSB packets, but just decodes the bytes - so the packets seen
-by the actual decoder may not be identical in places where the data is corrupt.
-One example of that would be when the buffer-switching interrupt has been too
-slow, and the buffer has been filled completely.  In that case, the last packet
-in the buffer might be truncated and immediately followed by a PSB as the trace
-continues in the next buffer.
-
-To disable the display of Intel PT packets, combine the -D option with
---no-itrace.
-
-
-perf report
-===========
-
-By default, perf report will decode trace data found in the perf.data file.
-This can be further controlled by new option --itrace exactly the same as
-perf script, with the exception that the default is --itrace=igxe.
-
-
-perf inject
-===========
-
-perf inject also accepts the --itrace option in which case tracing data is
-removed and replaced with the synthesized events. e.g.
-
-	perf inject --itrace -i perf.data -o perf.data.new
-
-Below is an example of using Intel PT with autofdo.  It requires autofdo
-(https://github.com/google/autofdo) and gcc version 5.  The bubble
-sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial)
-amended to take the number of elements as a parameter.
-
-	$ gcc-5 -O3 sort.c -o sort_optimized
-	$ ./sort_optimized 30000
-	Bubble sorting array of 30000 elements
-	2254 ms
-
-	$ cat ~/.perfconfig
-	[intel-pt]
-		mispred-all = on
-
-	$ perf record -e intel_pt//u ./sort 3000
-	Bubble sorting array of 3000 elements
-	58 ms
-	[ perf record: Woken up 2 times to write data ]
-	[ perf record: Captured and wrote 3.939 MB perf.data ]
-	$ perf inject -i perf.data -o inj --itrace=i100usle --strip
-	$ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1
-	$ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo
-	$ ./sort_autofdo 30000
-	Bubble sorting array of 30000 elements
-	2155 ms
-
-Note there is currently no advantage to using Intel PT instead of LBR, but
-that may change in the future if greater use is made of the data.
-
-
-PEBS via Intel PT
-=================
-
-Some hardware has the feature to redirect PEBS records to the Intel PT trace.
-Recording is selected by using the aux-output config term e.g.
-
-	perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname
-
-Note that currently, software only supports redirecting at most one PEBS event.
-
-To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g.
-
-	perf script --itrace=oe
+Documentation for support for Intel Processor Trace within perf tools' has moved to file perf-intel-pt.txt
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index a64d6588470e..70969ea73e01 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -66,4 +66,5 @@ include::itrace.txt[]
 
 SEE ALSO
 --------
-linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1],
+linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
new file mode 100644
index 000000000000..456fdcbf26ac
--- /dev/null
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -0,0 +1,1007 @@
+perf-intel-pt(1)
+================
+
+NAME
+----
+perf-intel-pt - Support for Intel Processor Trace within perf tools
+
+SYNOPSIS
+--------
+[verse]
+'perf record' -e intel_pt//
+
+DESCRIPTION
+-----------
+
+Intel Processor Trace (Intel PT) is an extension of Intel Architecture that
+collects information about software execution such as control flow, execution
+modes and timings and formats it into highly compressed binary packets.
+Technical details are documented in the Intel 64 and IA-32 Architectures
+Software Developer Manuals, Chapter 36 Intel Processor Trace.
+
+Intel PT is first supported in Intel Core M and 5th generation Intel Core
+processors that are based on the Intel micro-architecture code name Broadwell.
+
+Trace data is collected by 'perf record' and stored within the perf.data file.
+See below for options to 'perf record'.
+
+Trace data must be 'decoded' which involves walking the object code and matching
+the trace data packets. For example a TNT packet only tells whether a
+conditional branch was taken or not taken, so to make use of that packet the
+decoder must know precisely which instruction was being executed.
+
+Decoding is done on-the-fly.  The decoder outputs samples in the same format as
+samples output by perf hardware events, for example as though the "instructions"
+or "branches" events had been recorded.  Presently 3 tools support this:
+'perf script', 'perf report' and 'perf inject'.  See below for more information
+on using those tools.
+
+The main distinguishing feature of Intel PT is that the decoder can determine
+the exact flow of software execution.  Intel PT can be used to understand why
+and how did software get to a certain point, or behave a certain way.  The
+software does not have to be recompiled, so Intel PT works with debug or release
+builds, however the executed images are needed - which makes use in JIT-compiled
+environments, or with self-modified code, a challenge.  Also symbols need to be
+provided to make sense of addresses.
+
+A limitation of Intel PT is that it produces huge amounts of trace data
+(hundreds of megabytes per second per core) which takes a long time to decode,
+for example two or three orders of magnitude longer than it took to collect.
+Another limitation is the performance impact of tracing, something that will
+vary depending on the use-case and architecture.
+
+
+Quickstart
+----------
+
+It is important to start small.  That is because it is easy to capture vastly
+more data than can possibly be processed.
+
+The simplest thing to do with Intel PT is userspace profiling of small programs.
+Data is captured with 'perf record' e.g. to trace 'ls' userspace-only:
+
+	perf record -e intel_pt//u ls
+
+And profiled with 'perf report' e.g.
+
+	perf report
+
+To also trace kernel space presents a problem, namely kernel self-modifying
+code.  A fairly good kernel image is available in /proc/kcore but to get an
+accurate image a copy of /proc/kcore needs to be made under the same conditions
+as the data capture.  A script perf-with-kcore can do that, but beware that the
+script makes use of 'sudo' to copy /proc/kcore.  If you have perf installed
+locally from the source tree you can do:
+
+	~/libexec/perf-core/perf-with-kcore record pt_ls -e intel_pt// -- ls
+
+which will create a directory named 'pt_ls' and put the perf.data file and
+copies of /proc/kcore, /proc/kallsyms and /proc/modules into it.  Then to use
+'perf report' becomes:
+
+	~/libexec/perf-core/perf-with-kcore report pt_ls
+
+Because samples are synthesized after-the-fact, the sampling period can be
+selected for reporting. e.g. sample every microsecond
+
+	~/libexec/perf-core/perf-with-kcore report pt_ls --itrace=i1usge
+
+See the sections below for more information about the --itrace option.
+
+Beware the smaller the period, the more samples that are produced, and the
+longer it takes to process them.
+
+Also note that the coarseness of Intel PT timing information will start to
+distort the statistical value of the sampling as the sampling period becomes
+smaller.
+
+To represent software control flow, "branches" samples are produced.  By default
+a branch sample is synthesized for every single branch.  To get an idea what
+data is available you can use the 'perf script' tool with all itrace sampling
+options, which will list all the samples.
+
+	perf record -e intel_pt//u ls
+	perf script --itrace=ibxwpe
+
+An interesting field that is not printed by default is 'flags' which can be
+displayed as follows:
+
+	perf script --itrace=ibxwpe -F+flags
+
+The flags are "bcrosyiABEx" which stand for branch, call, return, conditional,
+system, asynchronous, interrupt, transaction abort, trace begin, trace end, and
+in transaction, respectively.
+
+Another interesting field that is not printed by default is 'ipc' which can be
+displayed as follows:
+
+	perf script --itrace=be -F+ipc
+
+There are two ways that instructions-per-cycle (IPC) can be calculated depending
+on the recording.
+
+If the 'cyc' config term (see config terms section below) was used, then IPC is
+calculated using the cycle count from CYC packets, otherwise MTC packets are
+used - refer to the 'mtc' config term.  When MTC is used, however, the values
+are less accurate because the timing is less accurate.
+
+Because Intel PT does not update the cycle count on every branch or instruction,
+the values will often be zero.  When there are values, they will be the number
+of instructions and number of cycles since the last update, and thus represent
+the average IPC since the last IPC for that event type.  Note IPC for "branches"
+events is calculated separately from IPC for "instructions" events.
+
+Also note that the IPC instruction count may or may not include the current
+instruction.  If the cycle count is associated with an asynchronous branch
+(e.g. page fault or interrupt), then the instruction count does not include the
+current instruction, otherwise it does.  That is consistent with whether or not
+that instruction has retired when the cycle count is updated.
+
+Another note, in the case of "branches" events, non-taken branches are not
+presently sampled, so IPC values for them do not appear e.g. a CYC packet with a
+TNT packet that starts with a non-taken branch.  To see every possible IPC
+value, "instructions" events can be used e.g. --itrace=i0ns
+
+While it is possible to create scripts to analyze the data, an alternative
+approach is available to export the data to a sqlite or postgresql database.
+Refer to script export-to-sqlite.py or export-to-postgresql.py for more details,
+and to script exported-sql-viewer.py for an example of using the database.
+
+There is also script intel-pt-events.py which provides an example of how to
+unpack the raw data for power events and PTWRITE.
+
+As mentioned above, it is easy to capture too much data.  One way to limit the
+data captured is to use 'snapshot' mode which is explained further below.
+Refer to 'new snapshot option' and 'Intel PT modes of operation' further below.
+
+Another problem that will be experienced is decoder errors.  They can be caused
+by inability to access the executed image, self-modified or JIT-ed code, or the
+inability to match side-band information (such as context switches and mmaps)
+which results in the decoder not knowing what code was executed.
+
+There is also the problem of perf not being able to copy the data fast enough,
+resulting in data lost because the buffer was full.  See 'Buffer handling' below
+for more details.
+
+
+perf record
+-----------
+
+new event
+~~~~~~~~~
+
+The Intel PT kernel driver creates a new PMU for Intel PT.  PMU events are
+selected by providing the PMU name followed by the "config" separated by slashes.
+An enhancement has been made to allow default "config" e.g. the option
+
+	-e intel_pt//
+
+will use a default config value.  Currently that is the same as
+
+	-e intel_pt/tsc,noretcomp=0/
+
+which is the same as
+
+	-e intel_pt/tsc=1,noretcomp=0/
+
+Note there are now new config terms - see section 'config terms' further below.
+
+The config terms are listed in /sys/devices/intel_pt/format.  They are bit
+fields within the config member of the struct perf_event_attr which is
+passed to the kernel by the perf_event_open system call.  They correspond to bit
+fields in the IA32_RTIT_CTL MSR.  Here is a list of them and their definitions:
+
+	$ grep -H . /sys/bus/event_source/devices/intel_pt/format/*
+	/sys/bus/event_source/devices/intel_pt/format/cyc:config:1
+	/sys/bus/event_source/devices/intel_pt/format/cyc_thresh:config:19-22
+	/sys/bus/event_source/devices/intel_pt/format/mtc:config:9
+	/sys/bus/event_source/devices/intel_pt/format/mtc_period:config:14-17
+	/sys/bus/event_source/devices/intel_pt/format/noretcomp:config:11
+	/sys/bus/event_source/devices/intel_pt/format/psb_period:config:24-27
+	/sys/bus/event_source/devices/intel_pt/format/tsc:config:10
+
+Note that the default config must be overridden for each term i.e.
+
+	-e intel_pt/noretcomp=0/
+
+is the same as:
+
+	-e intel_pt/tsc=1,noretcomp=0/
+
+So, to disable TSC packets use:
+
+	-e intel_pt/tsc=0/
+
+It is also possible to specify the config value explicitly:
+
+	-e intel_pt/config=0x400/
+
+Note that, as with all events, the event is suffixed with event modifiers:
+
+	u	userspace
+	k	kernel
+	h	hypervisor
+	G	guest
+	H	host
+	p	precise ip
+
+'h', 'G' and 'H' are for virtualization which is not supported by Intel PT.
+'p' is also not relevant to Intel PT.  So only options 'u' and 'k' are
+meaningful for Intel PT.
+
+perf_event_attr is displayed if the -vv option is used e.g.
+
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             6
+	size                             112
+	config                           0x400
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	enable_on_exec                   1
+	sample_id_all                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+
+
+config terms
+~~~~~~~~~~~~
+
+The June 2015 version of Intel 64 and IA-32 Architectures Software Developer
+Manuals, Chapter 36 Intel Processor Trace, defined new Intel PT features.
+Some of the features are reflect in new config terms.  All the config terms are
+described below.
+
+tsc		Always supported.  Produces TSC timestamp packets to provide
+		timing information.  In some cases it is possible to decode
+		without timing information, for example a per-thread context
+		that does not overlap executable memory maps.
+
+		The default config selects tsc (i.e. tsc=1).
+
+noretcomp	Always supported.  Disables "return compression" so a TIP packet
+		is produced when a function returns.  Causes more packets to be
+		produced but might make decoding more reliable.
+
+		The default config does not select noretcomp (i.e. noretcomp=0).
+
+psb_period	Allows the frequency of PSB packets to be specified.
+
+		The PSB packet is a synchronization packet that provides a
+		starting point for decoding or recovery from errors.
+
+		Support for psb_period is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
+
+		which contains "1" if the feature is supported and "0"
+		otherwise.
+
+		Valid values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_periods
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The psb_period value is converted to the approximate number of
+		trace bytes between PSB packets as:
+
+			2 ^ (value + 11)
+
+		e.g. value 3 means 16KiB bytes between PSBs
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/psb_period=15/u uname
+			Invalid psb_period for intel_pt. Valid values are: 0-5
+
+		If MTC packets are selected, the default config selects a value
+		of 3 (i.e. psb_period=3) or the nearest lower value that is
+		supported (0 is always supported).  Otherwise the default is 0.
+
+		If decoding is expected to be reliable and the buffer is large
+		then a large PSB period can be used.
+
+		Because a TSC packet is produced with PSB, the PSB period can
+		also affect the granularity to timing information in the absence
+		of MTC or CYC.
+
+mtc		Produces MTC timing packets.
+
+		MTC packets provide finer grain timestamp information than TSC
+		packets.  MTC packets record time using the hardware crystal
+		clock (CTC) which is related to TSC packets using a TMA packet.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/mtc
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		The frequency of MTC packets can also be specified - see
+		mtc_period below.
+
+mtc_period	Specifies how frequently MTC packets are produced - see mtc
+		above for how to determine if MTC packets are supported.
+
+		Valid values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/mtc_periods
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The mtc_period value is converted to the MTC frequency as:
+
+			CTC-frequency / (2 ^ value)
+
+		e.g. value 3 means one eighth of CTC-frequency
+
+		Where CTC is the hardware crystal clock, the frequency of which
+		can be related to TSC via values provided in cpuid leaf 0x15.
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/mtc_period=15/u uname
+			Invalid mtc_period for intel_pt. Valid values are: 0,3,6,9
+
+		The default value is 3 or the nearest lower value
+		that is supported (0 is always supported).
+
+cyc		Produces CYC timing packets.
+
+		CYC packets provide even finer grain timestamp information than
+		MTC and TSC packets.  A CYC packet contains the number of CPU
+		cycles since the last CYC packet. Unlike MTC and TSC packets,
+		CYC packets are only sent when another packet is also sent.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/psb_cyc
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+		The number of CYC packets produced can be reduced by specifying
+		a threshold - see cyc_thresh below.
+
+cyc_thresh	Specifies how frequently CYC packets are produced - see cyc
+		above for how to determine if CYC packets are supported.
+
+		Valid cyc_thresh values are given by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/cycle_thresholds
+
+		which contains a hexadecimal value, the bits of which represent
+		valid values e.g. bit 2 set means value 2 is valid.
+
+		The cyc_thresh value represents the minimum number of CPU cycles
+		that must have passed before a CYC packet can be sent.  The
+		number of CPU cycles is:
+
+			2 ^ (value - 1)
+
+		e.g. value 4 means 8 CPU cycles must pass before a CYC packet
+		can be sent.  Note a CYC packet is still only sent when another
+		packet is sent, not at, e.g. every 8 CPU cycles.
+
+		If an invalid value is entered, the error message
+		will give a list of valid values e.g.
+
+			$ perf record -e intel_pt/cyc,cyc_thresh=15/u uname
+			Invalid cyc_thresh for intel_pt. Valid values are: 0-12
+
+		CYC packets are not requested by default.
+
+pt		Specifies pass-through which enables the 'branch' config term.
+
+		The default config selects 'pt' if it is available, so a user will
+		never need to specify this term.
+
+branch		Enable branch tracing.  Branch tracing is enabled by default so to
+		disable branch tracing use 'branch=0'.
+
+		The default config selects 'branch' if it is available.
+
+ptw		Enable PTWRITE packets which are produced when a ptwrite instruction
+		is executed.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/ptwrite
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+fup_on_ptw	Enable a FUP packet to follow the PTWRITE packet.  The FUP packet
+		provides the address of the ptwrite instruction.  In the absence of
+		fup_on_ptw, the decoder will use the address of the previous branch
+		if branch tracing is enabled, otherwise the address will be zero.
+		Note that fup_on_ptw will work even when branch tracing is disabled.
+
+pwr_evt		Enable power events.  The power events provide information about
+		changes to the CPU C-state.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/power_event_trace
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+
+AUX area sampling option
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To select Intel PT "sampling" the AUX area sampling option can be used:
+
+	--aux-sample
+
+Optionally it can be followed by the sample size in bytes e.g.
+
+	--aux-sample=8192
+
+In addition, the Intel PT event to sample must be defined e.g.
+
+	-e intel_pt//u
+
+Samples on other events will be created containing Intel PT data e.g. the
+following will create Intel PT samples on the branch-misses event, note the
+events must be grouped using {}:
+
+	perf record --aux-sample -e '{intel_pt//u,branch-misses:u}'
+
+An alternative to '--aux-sample' is to add the config term 'aux-sample-size' to
+events.  In this case, the grouping is implied e.g.
+
+	perf record -e intel_pt//u -e branch-misses/aux-sample-size=8192/u
+
+is the same as:
+
+	perf record -e '{intel_pt//u,branch-misses/aux-sample-size=8192/u}'
+
+but allows for also using an address filter e.g.:
+
+	perf record -e intel_pt//u --filter 'filter * @/bin/ls' -e branch-misses/aux-sample-size=8192/u -- ls
+
+It is important to select a sample size that is big enough to contain at least
+one PSB packet.  If not a warning will be displayed:
+
+	Intel PT sample size (%zu) may be too small for PSB period (%zu)
+
+The calculation used for that is: if sample_size <= psb_period + 256 display the
+warning.  When sampling is used, psb_period defaults to 0 (2KiB).
+
+The default sample size is 4KiB.
+
+The sample size is passed in aux_sample_size in struct perf_event_attr.  The
+sample size is limited by the maximum event size which is 64KiB.  It is
+difficult to know how big the event might be without the trace sample attached,
+but the tool validates that the sample size is not greater than 60KiB.
+
+
+new snapshot option
+~~~~~~~~~~~~~~~~~~~
+
+The difference between full trace and snapshot from the kernel's perspective is
+that in full trace we don't overwrite trace data that the user hasn't collected
+yet (and indicated that by advancing aux_tail), whereas in snapshot mode we let
+the trace run and overwrite older data in the buffer so that whenever something
+interesting happens, we can stop it and grab a snapshot of what was going on
+around that interesting moment.
+
+To select snapshot mode a new option has been added:
+
+	-S
+
+Optionally it can be followed by the snapshot size e.g.
+
+	-S0x100000
+
+The default snapshot size is the auxtrace mmap size.  If neither auxtrace mmap size
+nor snapshot size is specified, then the default is 4MiB for privileged users
+(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
+If an unprivileged user does not specify mmap pages, the mmap pages will be
+reduced as described in the 'new auxtrace mmap size option' section below.
+
+The snapshot size is displayed if the option -vv is used e.g.
+
+	Intel PT snapshot size: %zu
+
+
+new auxtrace mmap size option
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Intel PT buffer size is specified by an addition to the -m option e.g.
+
+	-m,16
+
+selects a buffer size of 16 pages i.e. 64KiB.
+
+Note that the existing functionality of -m is unchanged.  The auxtrace mmap size
+is specified by the optional addition of a comma and the value.
+
+The default auxtrace mmap size for Intel PT is 4MiB/page_size for privileged users
+(or if /proc/sys/kernel/perf_event_paranoid < 0), 128KiB for unprivileged users.
+If an unprivileged user does not specify mmap pages, the mmap pages will be
+reduced from the default 512KiB/page_size to 256KiB/page_size, otherwise the
+user is likely to get an error as they exceed their mlock limit (Max locked
+memory as shown in /proc/self/limits).  Note that perf does not count the first
+512KiB (actually /proc/sys/kernel/perf_event_mlock_kb minus 1 page) per cpu
+against the mlock limit so an unprivileged user is allowed 512KiB per cpu plus
+their mlock limit (which defaults to 64KiB but is not multiplied by the number
+of cpus).
+
+In full-trace mode, powers of two are allowed for buffer size, with a minimum
+size of 2 pages.  In snapshot mode or sampling mode, it is the same but the
+minimum size is 1 page.
+
+The mmap size and auxtrace mmap size are displayed if the -vv option is used e.g.
+
+	mmap length 528384
+	auxtrace mmap length 4198400
+
+
+Intel PT modes of operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Intel PT can be used in 2 modes:
+	full-trace mode
+	sample mode
+	snapshot mode
+
+Full-trace mode traces continuously e.g.
+
+	perf record -e intel_pt//u uname
+
+Sample mode attaches a Intel PT sample to other events e.g.
+
+	perf record --aux-sample -e intel_pt//u -e branch-misses:u
+
+Snapshot mode captures the available data when a signal is sent e.g.
+
+	perf record -v -e intel_pt//u -S ./loopy 1000000000 &
+	[1] 11435
+	kill -USR2 11435
+	Recording AUX area tracing snapshot
+
+Note that the signal sent is SIGUSR2.
+Note that "Recording AUX area tracing snapshot" is displayed because the -v
+option is used.
+
+The 2 modes cannot be used together.
+
+
+Buffer handling
+~~~~~~~~~~~~~~~
+
+There may be buffer limitations (i.e. single ToPa entry) which means that actual
+buffer sizes are limited to powers of 2 up to 4MiB (MAX_ORDER).  In order to
+provide other sizes, and in particular an arbitrarily large size, multiple
+buffers are logically concatenated.  However an interrupt must be used to switch
+between buffers.  That has two potential problems:
+	a) the interrupt may not be handled in time so that the current buffer
+	becomes full and some trace data is lost.
+	b) the interrupts may slow the system and affect the performance
+	results.
+
+If trace data is lost, the driver sets 'truncated' in the PERF_RECORD_AUX event
+which the tools report as an error.
+
+In full-trace mode, the driver waits for data to be copied out before allowing
+the (logical) buffer to wrap-around.  If data is not copied out quickly enough,
+again 'truncated' is set in the PERF_RECORD_AUX event.  If the driver has to
+wait, the intel_pt event gets disabled.  Because it is difficult to know when
+that happens, perf tools always re-enable the intel_pt event after copying out
+data.
+
+
+Intel PT and build ids
+~~~~~~~~~~~~~~~~~~~~~~
+
+By default "perf record" post-processes the event stream to find all build ids
+for executables for all addresses sampled.  Deliberately, Intel PT is not
+decoded for that purpose (it would take too long).  Instead the build ids for
+all executables encountered (due to mmap, comm or task events) are included
+in the perf.data file.
+
+To see buildids included in the perf.data file use the command:
+
+	perf buildid-list
+
+If the perf.data file contains Intel PT data, that is the same as:
+
+	perf buildid-list --with-hits
+
+
+Snapshot mode and event disabling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to make a snapshot, the intel_pt event is disabled using an IOCTL,
+namely PERF_EVENT_IOC_DISABLE.  However doing that can also disable the
+collection of side-band information.  In order to prevent that,  a dummy
+software event has been introduced that permits tracking events (like mmaps) to
+continue to be recorded while intel_pt is disabled.  That is important to ensure
+there is complete side-band information to allow the decoding of subsequent
+snapshots.
+
+A test has been created for that.  To find the test:
+
+	perf test list
+	...
+	23: Test using a dummy software event to keep tracking
+
+To run the test:
+
+	perf test 23
+	23: Test using a dummy software event to keep tracking     : Ok
+
+
+perf record modes (nothing new here)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+perf record essentially operates in one of three modes:
+	per thread
+	per cpu
+	workload only
+
+"per thread" mode is selected by -t or by --per-thread (with -p or -u or just a
+workload).
+"per cpu" is selected by -C or -a.
+"workload only" mode is selected by not using the other options but providing a
+command to run (i.e. the workload).
+
+In per-thread mode an exact list of threads is traced.  There is no inheritance.
+Each thread has its own event buffer.
+
+In per-cpu mode all processes (or processes from the selected cgroup i.e. -G
+option, or processes selected with -p or -u) are traced.  Each cpu has its own
+buffer. Inheritance is allowed.
+
+In workload-only mode, the workload is traced but with per-cpu buffers.
+Inheritance is allowed.  Note that you can now trace a workload in per-thread
+mode by using the --per-thread option.
+
+
+Privileged vs non-privileged users
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users
+have memory limits imposed upon them.  That affects what buffer sizes they can
+have as outlined above.
+
+The v4.2 kernel introduced support for a context switch metadata event,
+PERF_RECORD_SWITCH, which allows unprivileged users to see when their processes
+are scheduled out and in, just not by whom, which is left for the
+PERF_RECORD_SWITCH_CPU_WIDE, that is only accessible in system wide context,
+which in turn requires CAP_SYS_ADMIN.
+
+Please see the 45ac1403f564 ("perf: Add PERF_RECORD_SWITCH to indicate context
+switches") commit, that introduces these metadata events for further info.
+
+When working with kernels < v4.2, the following considerations must be taken,
+as the sched:sched_switch tracepoints will be used to receive such information:
+
+Unless /proc/sys/kernel/perf_event_paranoid is set to -1, unprivileged users are
+not permitted to use tracepoints which means there is insufficient side-band
+information to decode Intel PT in per-cpu mode, and potentially workload-only
+mode too if the workload creates new processes.
+
+Note also, that to use tracepoints, read-access to debugfs is required.  So if
+debugfs is not mounted or the user does not have read-access, it will again not
+be possible to decode Intel PT in per-cpu mode.
+
+
+sched_switch tracepoint
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The sched_switch tracepoint is used to provide side-band data for Intel PT
+decoding in kernels where the PERF_RECORD_SWITCH metadata event isn't
+available.
+
+The sched_switch events are automatically added. e.g. the second event shown
+below:
+
+	$ perf record -vv -e intel_pt//u uname
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             6
+	size                             112
+	config                           0x400
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	enable_on_exec                   1
+	sample_id_all                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             2
+	size                             112
+	config                           0x108
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|CPU|PERIOD|RAW|IDENTIFIER
+	read_format                      ID
+	inherit                          1
+	sample_id_all                    1
+	exclude_guest                    1
+	------------------------------------------------------------
+	sys_perf_event_open: pid -1  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid -1  cpu 3  group_fd -1  flags 0x8
+	------------------------------------------------------------
+	perf_event_attr:
+	type                             1
+	size                             112
+	config                           0x9
+	{ sample_period, sample_freq }   1
+	sample_type                      IP|TID|TIME|IDENTIFIER
+	read_format                      ID
+	disabled                         1
+	inherit                          1
+	exclude_kernel                   1
+	exclude_hv                       1
+	mmap                             1
+	comm                             1
+	enable_on_exec                   1
+	task                             1
+	sample_id_all                    1
+	mmap2                            1
+	comm_exec                        1
+	------------------------------------------------------------
+	sys_perf_event_open: pid 31104  cpu 0  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 1  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 2  group_fd -1  flags 0x8
+	sys_perf_event_open: pid 31104  cpu 3  group_fd -1  flags 0x8
+	mmap size 528384B
+	AUX area mmap length 4194304
+	perf event ring buffer mmapped per cpu
+	Synthesizing auxtrace information
+	Linux
+	[ perf record: Woken up 1 times to write data ]
+	[ perf record: Captured and wrote 0.042 MB perf.data ]
+
+Note, the sched_switch event is only added if the user is permitted to use it
+and only in per-cpu mode.
+
+Note also, the sched_switch event is only added if TSC packets are requested.
+That is because, in the absence of timing information, the sched_switch events
+cannot be matched against the Intel PT trace.
+
+
+perf script
+-----------
+
+By default, perf script will decode trace data found in the perf.data file.
+This can be further controlled by new option --itrace.
+
+
+New --itrace option
+~~~~~~~~~~~~~~~~~~~
+
+Having no option is the same as
+
+	--itrace
+
+which, in turn, is the same as
+
+	--itrace=cepwx
+
+The letters are:
+
+	i	synthesize "instructions" events
+	b	synthesize "branches" events
+	x	synthesize "transactions" events
+	w	synthesize "ptwrite" events
+	p	synthesize "power" events
+	c	synthesize branches events (calls only)
+	r	synthesize branches events (returns only)
+	e	synthesize tracing error events
+	d	create a debug log
+	g	synthesize a call chain (use with i or x)
+	l	synthesize last branch entries (use with i or x)
+	s	skip initial number of events
+
+"Instructions" events look like they were recorded by "perf record -e
+instructions".
+
+"Branches" events look like they were recorded by "perf record -e branches". "c"
+and "r" can be combined to get calls and returns.
+
+"Transactions" events correspond to the start or end of transactions. The
+'flags' field can be used in perf script to determine whether the event is a
+tranasaction start, commit or abort.
+
+Note that "instructions", "branches" and "transactions" events depend on code
+flow packets which can be disabled by using the config term "branch=0".  Refer
+to the config terms section above.
+
+"ptwrite" events record the payload of the ptwrite instruction and whether
+"fup_on_ptw" was used.  "ptwrite" events depend on PTWRITE packets which are
+recorded only if the "ptw" config term was used.  Refer to the config terms
+section above.  perf script "synth" field displays "ptwrite" information like
+this: "ip: 0 payload: 0x123456789abcdef0"  where "ip" is 1 if "fup_on_ptw" was
+used.
+
+"Power" events correspond to power event packets and CBR (core-to-bus ratio)
+packets.  While CBR packets are always recorded when tracing is enabled, power
+event packets are recorded only if the "pwr_evt" config term was used.  Refer to
+the config terms section above.  The power events record information about
+C-state changes, whereas CBR is indicative of CPU frequency.  perf script
+"event,synth" fields display information like this:
+	cbr:  cbr: 22 freq: 2189 MHz (200%)
+	mwait:  hints: 0x60 extensions: 0x1
+	pwre:  hw: 0 cstate: 2 sub-cstate: 0
+	exstop:  ip: 1
+	pwrx:  deepest cstate: 2 last cstate: 2 wake reason: 0x4
+Where:
+	"cbr" includes the frequency and the percentage of maximum non-turbo
+	"mwait" shows mwait hints and extensions
+	"pwre" shows C-state transitions (to a C-state deeper than C0) and
+	whether	initiated by hardware
+	"exstop" indicates execution stopped and whether the IP was recorded
+	exactly,
+	"pwrx" indicates return to C0
+For more details refer to the Intel 64 and IA-32 Architectures Software
+Developer Manuals.
+
+Error events show where the decoder lost the trace.  Error events
+are quite important.  Users must know if what they are seeing is a complete
+picture or not.
+
+The "d" option will cause the creation of a file "intel_pt.log" containing all
+decoded packets and instructions.  Note that this option slows down the decoder
+and that the resulting file may be very large.
+
+In addition, the period of the "instructions" event can be specified. e.g.
+
+	--itrace=i10us
+
+sets the period to 10us i.e. one  instruction sample is synthesized for each 10
+microseconds of trace.  Alternatives to "us" are "ms" (milliseconds),
+"ns" (nanoseconds), "t" (TSC ticks) or "i" (instructions).
+
+"ms", "us" and "ns" are converted to TSC ticks.
+
+The timing information included with Intel PT does not give the time of every
+instruction.  Consequently, for the purpose of sampling, the decoder estimates
+the time since the last timing packet based on 1 tick per instruction.  The time
+on the sample is *not* adjusted and reflects the last known value of TSC.
+
+For Intel PT, the default period is 100us.
+
+Setting it to a zero period means "as often as possible".
+
+In the case of Intel PT that is the same as a period of 1 and a unit of
+'instructions' (i.e. --itrace=i1i).
+
+Also the call chain size (default 16, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+	--itrace=ig32
+	--itrace=xg32
+
+Also the number of last branch entries (default 64, max. 1024) for instructions or
+transactions events can be specified. e.g.
+
+       --itrace=il10
+       --itrace=xl10
+
+Note that last branch entries are cleared for each sample, so there is no overlap
+from one sample to the next.
+
+To disable trace decoding entirely, use the option --no-itrace.
+
+It is also possible to skip events generated (instructions, branches, transactions)
+at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+skips the first million instructions.
+
+dump option
+~~~~~~~~~~~
+
+perf script has an option (-D) to "dump" the events i.e. display the binary
+data.
+
+When -D is used, Intel PT packets are displayed.  The packet decoder does not
+pay attention to PSB packets, but just decodes the bytes - so the packets seen
+by the actual decoder may not be identical in places where the data is corrupt.
+One example of that would be when the buffer-switching interrupt has been too
+slow, and the buffer has been filled completely.  In that case, the last packet
+in the buffer might be truncated and immediately followed by a PSB as the trace
+continues in the next buffer.
+
+To disable the display of Intel PT packets, combine the -D option with
+--no-itrace.
+
+
+perf report
+-----------
+
+By default, perf report will decode trace data found in the perf.data file.
+This can be further controlled by new option --itrace exactly the same as
+perf script, with the exception that the default is --itrace=igxe.
+
+
+perf inject
+-----------
+
+perf inject also accepts the --itrace option in which case tracing data is
+removed and replaced with the synthesized events. e.g.
+
+	perf inject --itrace -i perf.data -o perf.data.new
+
+Below is an example of using Intel PT with autofdo.  It requires autofdo
+(https://github.com/google/autofdo) and gcc version 5.  The bubble
+sort example is from the AutoFDO tutorial (https://gcc.gnu.org/wiki/AutoFDO/Tutorial)
+amended to take the number of elements as a parameter.
+
+	$ gcc-5 -O3 sort.c -o sort_optimized
+	$ ./sort_optimized 30000
+	Bubble sorting array of 30000 elements
+	2254 ms
+
+	$ cat ~/.perfconfig
+	[intel-pt]
+		mispred-all = on
+
+	$ perf record -e intel_pt//u ./sort 3000
+	Bubble sorting array of 3000 elements
+	58 ms
+	[ perf record: Woken up 2 times to write data ]
+	[ perf record: Captured and wrote 3.939 MB perf.data ]
+	$ perf inject -i perf.data -o inj --itrace=i100usle --strip
+	$ ./create_gcov --binary=./sort --profile=inj --gcov=sort.gcov -gcov_version=1
+	$ gcc-5 -O3 -fauto-profile=sort.gcov sort.c -o sort_autofdo
+	$ ./sort_autofdo 30000
+	Bubble sorting array of 30000 elements
+	2155 ms
+
+Note there is currently no advantage to using Intel PT instead of LBR, but
+that may change in the future if greater use is made of the data.
+
+
+PEBS via Intel PT
+-----------------
+
+Some hardware has the feature to redirect PEBS records to the Intel PT trace.
+Recording is selected by using the aux-output config term e.g.
+
+	perf record -c 10000 -e '{intel_pt/branch=0/,cycles/aux-output/ppp}' uname
+
+Note that currently, software only supports redirecting at most one PEBS event.
+
+To display PEBS events from the Intel PT trace, use the itrace 'o' option e.g.
+
+	perf script --itrace=oe
+
+
+SEE ALSO
+--------
+
+linkperf:perf-record[1], linkperf:perf-script[1], linkperf:perf-report[1],
+linkperf:perf-inject[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b23a4012a606..7f4db7592467 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -589,4 +589,4 @@ appended unit character - B/K/M/G
 
 SEE ALSO
 --------
-linkperf:perf-stat[1], linkperf:perf-list[1]
+linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index db61f16ffa56..bd0a029d4c08 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -546,4 +546,5 @@ include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
 --------
-linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1]
+linkperf:perf-stat[1], linkperf:perf-annotate[1], linkperf:perf-record[1],
+linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 2599b057e47b..db6a36aac47e 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -429,4 +429,4 @@ include::itrace.txt[]
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
-linkperf:perf-script-python[1]
+linkperf:perf-script-python[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 9431b8066fb4..4d56586b2fb9 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -334,6 +334,15 @@ Configure all used events to run in kernel space.
 --all-user::
 Configure all used events to run in user space.
 
+--percore-show-thread::
+The event modifier "percore" has supported to sum up the event counts
+for all hardware threads in a core and show the counts per core.
+
+This option with event modifier "percore" enabled also sums up the event
+counts for all hardware threads in a core but show the sum counts per
+hardware thread. This is essentially a replacement for the any bit and
+convenient for post processing.
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index c03c36fde7e2..5e697cd2224a 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -572,29 +572,12 @@ static void init_block_hist(struct block_hist *bh)
 	bh->valid = true;
 }
 
-static int block_pair_cmp(struct hist_entry *a, struct hist_entry *b)
-{
-	struct block_info *bi_a = a->block_info;
-	struct block_info *bi_b = b->block_info;
-	int cmp;
-
-	if (!bi_a->sym || !bi_b->sym)
-		return -1;
-
-	cmp = strcmp(bi_a->sym->name, bi_b->sym->name);
-
-	if ((!cmp) && (bi_a->start == bi_b->start) && (bi_a->end == bi_b->end))
-		return 0;
-
-	return -1;
-}
-
 static struct hist_entry *get_block_pair(struct hist_entry *he,
 					 struct hists *hists_pair)
 {
 	struct rb_root_cached *root = hists_pair->entries_in;
 	struct rb_node *next = rb_first_cached(root);
-	int cmp;
+	int64_t cmp;
 
 	while (next != NULL) {
 		struct hist_entry *he_pair = rb_entry(next, struct hist_entry,
@@ -602,7 +585,7 @@ static struct hist_entry *get_block_pair(struct hist_entry *he,
 
 		next = rb_next(&he_pair->rb_node_in);
 
-		cmp = block_pair_cmp(he_pair, he);
+		cmp = __block_info__cmp(he_pair, he);
 		if (!cmp)
 			return he_pair;
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 72a12b69f120..5f4045df76f4 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -104,6 +104,7 @@ struct report {
 	bool			symbol_ipc;
 	bool			total_cycles_mode;
 	struct block_report	*block_reports;
+	int			nr_block_reports;
 };
 
 static int report__config(const char *var, const char *value, void *cb)
@@ -185,24 +186,23 @@ static int hist_iter__branch_callback(struct hist_entry_iter *iter,
 {
 	struct hist_entry *he = iter->he;
 	struct report *rep = arg;
-	struct branch_info *bi;
+	struct branch_info *bi = he->branch_info;
 	struct perf_sample *sample = iter->sample;
 	struct evsel *evsel = iter->evsel;
 	int err;
 
+	branch_type_count(&rep->brtype_stat, &bi->flags,
+			  bi->from.addr, bi->to.addr);
+
 	if (!ui__has_annotation() && !rep->symbol_ipc)
 		return 0;
 
-	bi = he->branch_info;
 	err = addr_map_symbol__inc_samples(&bi->from, sample, evsel);
 	if (err)
 		goto out;
 
 	err = addr_map_symbol__inc_samples(&bi->to, sample, evsel);
 
-	branch_type_count(&rep->brtype_stat, &bi->flags,
-			  bi->from.addr, bi->to.addr);
-
 out:
 	return err;
 }
@@ -966,8 +966,19 @@ static int __cmd_report(struct report *rep)
 	report__output_resort(rep);
 
 	if (rep->total_cycles_mode) {
+		int block_hpps[6] = {
+			PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT,
+			PERF_HPP_REPORT__BLOCK_LBR_CYCLES,
+			PERF_HPP_REPORT__BLOCK_CYCLES_PCT,
+			PERF_HPP_REPORT__BLOCK_AVG_CYCLES,
+			PERF_HPP_REPORT__BLOCK_RANGE,
+			PERF_HPP_REPORT__BLOCK_DSO,
+		};
+
 		rep->block_reports = block_info__create_report(session->evlist,
-							       rep->total_cycles);
+							       rep->total_cycles,
+							       block_hpps, 6,
+							       &rep->nr_block_reports);
 		if (!rep->block_reports)
 			return -1;
 	}
@@ -1551,8 +1562,11 @@ error:
 		zfree(&report.ptime_range);
 	}
 
-	if (report.block_reports)
-		zfree(&report.block_reports);
+	if (report.block_reports) {
+		block_info__free_report(report.block_reports,
+					report.nr_block_reports);
+		report.block_reports = NULL;
+	}
 
 	zstd_fini(&(session->zstd_data));
 	perf_session__delete(session);
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index e2406b291c1c..656b347f6dd8 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -735,6 +735,7 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample,
 					struct perf_event_attr *attr, FILE *fp)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	struct addr_location alf, alt;
 	u64 i, from, to;
 	int printed = 0;
@@ -743,8 +744,8 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample,
 		return 0;
 
 	for (i = 0; i < br->nr; i++) {
-		from = br->entries[i].from;
-		to   = br->entries[i].to;
+		from = entries[i].from;
+		to   = entries[i].to;
 
 		if (PRINT_FIELD(DSO)) {
 			memset(&alf, 0, sizeof(alf));
@@ -768,10 +769,10 @@ static int perf_sample__fprintf_brstack(struct perf_sample *sample,
 		}
 
 		printed += fprintf(fp, "/%c/%c/%c/%d ",
-			mispred_str( br->entries + i),
-			br->entries[i].flags.in_tx? 'X' : '-',
-			br->entries[i].flags.abort? 'A' : '-',
-			br->entries[i].flags.cycles);
+			mispred_str(entries + i),
+			entries[i].flags.in_tx ? 'X' : '-',
+			entries[i].flags.abort ? 'A' : '-',
+			entries[i].flags.cycles);
 	}
 
 	return printed;
@@ -782,6 +783,7 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample,
 					   struct perf_event_attr *attr, FILE *fp)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	struct addr_location alf, alt;
 	u64 i, from, to;
 	int printed = 0;
@@ -793,8 +795,8 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample,
 
 		memset(&alf, 0, sizeof(alf));
 		memset(&alt, 0, sizeof(alt));
-		from = br->entries[i].from;
-		to   = br->entries[i].to;
+		from = entries[i].from;
+		to   = entries[i].to;
 
 		thread__find_symbol_fb(thread, sample->cpumode, from, &alf);
 		thread__find_symbol_fb(thread, sample->cpumode, to, &alt);
@@ -813,10 +815,10 @@ static int perf_sample__fprintf_brstacksym(struct perf_sample *sample,
 			printed += fprintf(fp, ")");
 		}
 		printed += fprintf(fp, "/%c/%c/%c/%d ",
-			mispred_str( br->entries + i),
-			br->entries[i].flags.in_tx? 'X' : '-',
-			br->entries[i].flags.abort? 'A' : '-',
-			br->entries[i].flags.cycles);
+			mispred_str(entries + i),
+			entries[i].flags.in_tx ? 'X' : '-',
+			entries[i].flags.abort ? 'A' : '-',
+			entries[i].flags.cycles);
 	}
 
 	return printed;
@@ -827,6 +829,7 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample,
 					   struct perf_event_attr *attr, FILE *fp)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	struct addr_location alf, alt;
 	u64 i, from, to;
 	int printed = 0;
@@ -838,8 +841,8 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample,
 
 		memset(&alf, 0, sizeof(alf));
 		memset(&alt, 0, sizeof(alt));
-		from = br->entries[i].from;
-		to   = br->entries[i].to;
+		from = entries[i].from;
+		to   = entries[i].to;
 
 		if (thread__find_map_fb(thread, sample->cpumode, from, &alf) &&
 		    !alf.map->dso->adjust_symbols)
@@ -862,10 +865,10 @@ static int perf_sample__fprintf_brstackoff(struct perf_sample *sample,
 			printed += fprintf(fp, ")");
 		}
 		printed += fprintf(fp, "/%c/%c/%c/%d ",
-			mispred_str(br->entries + i),
-			br->entries[i].flags.in_tx ? 'X' : '-',
-			br->entries[i].flags.abort ? 'A' : '-',
-			br->entries[i].flags.cycles);
+			mispred_str(entries + i),
+			entries[i].flags.in_tx ? 'X' : '-',
+			entries[i].flags.abort ? 'A' : '-',
+			entries[i].flags.cycles);
 	}
 
 	return printed;
@@ -1053,6 +1056,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 					    struct machine *machine, FILE *fp)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	u64 start, end;
 	int i, insn, len, nr, ilen, printed = 0;
 	struct perf_insn x;
@@ -1073,31 +1077,31 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 	printed += fprintf(fp, "%c", '\n');
 
 	/* Handle first from jump, of which we don't know the entry. */
-	len = grab_bb(buffer, br->entries[nr-1].from,
-			br->entries[nr-1].from,
+	len = grab_bb(buffer, entries[nr-1].from,
+			entries[nr-1].from,
 			machine, thread, &x.is64bit, &x.cpumode, false);
 	if (len > 0) {
-		printed += ip__fprintf_sym(br->entries[nr - 1].from, thread,
+		printed += ip__fprintf_sym(entries[nr - 1].from, thread,
 					   x.cpumode, x.cpu, &lastsym, attr, fp);
-		printed += ip__fprintf_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+		printed += ip__fprintf_jump(entries[nr - 1].from, &entries[nr - 1],
 					    &x, buffer, len, 0, fp, &total_cycles);
 		if (PRINT_FIELD(SRCCODE))
-			printed += print_srccode(thread, x.cpumode, br->entries[nr - 1].from);
+			printed += print_srccode(thread, x.cpumode, entries[nr - 1].from);
 	}
 
 	/* Print all blocks */
 	for (i = nr - 2; i >= 0; i--) {
-		if (br->entries[i].from || br->entries[i].to)
+		if (entries[i].from || entries[i].to)
 			pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
-				 br->entries[i].from,
-				 br->entries[i].to);
-		start = br->entries[i + 1].to;
-		end   = br->entries[i].from;
+				 entries[i].from,
+				 entries[i].to);
+		start = entries[i + 1].to;
+		end   = entries[i].from;
 
 		len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
 		/* Patch up missing kernel transfers due to ring filters */
 		if (len == -ENXIO && i > 0) {
-			end = br->entries[--i].from;
+			end = entries[--i].from;
 			pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
 			len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
 		}
@@ -1110,7 +1114,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 
 			printed += ip__fprintf_sym(ip, thread, x.cpumode, x.cpu, &lastsym, attr, fp);
 			if (ip == end) {
-				printed += ip__fprintf_jump(ip, &br->entries[i], &x, buffer + off, len - off, ++insn, fp,
+				printed += ip__fprintf_jump(ip, &entries[i], &x, buffer + off, len - off, ++insn, fp,
 							    &total_cycles);
 				if (PRINT_FIELD(SRCCODE))
 					printed += print_srccode(thread, x.cpumode, ip);
@@ -1134,9 +1138,9 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 	 * Hit the branch? In this case we are already done, and the target
 	 * has not been executed yet.
 	 */
-	if (br->entries[0].from == sample->ip)
+	if (entries[0].from == sample->ip)
 		goto out;
-	if (br->entries[0].flags.abort)
+	if (entries[0].flags.abort)
 		goto out;
 
 	/*
@@ -1147,7 +1151,7 @@ static int perf_sample__fprintf_brstackinsn(struct perf_sample *sample,
 	 * between final branch and sample. When this happens just
 	 * continue walking after the last TO until we hit a branch.
 	 */
-	start = br->entries[0].to;
+	start = entries[0].to;
 	end = sample->ip;
 	if (end < start) {
 		/* Missing jump. Scan 128 bytes for the next branch */
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a098c2ebf4ea..ec053dc1e35c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -929,6 +929,10 @@ static struct option stat_options[] = {
 	OPT_BOOLEAN_FLAG(0, "all-user", &stat_config.all_user,
 			 "Configure all used events to run in user space.",
 			 PARSE_OPT_EXCLUSIVE),
+	OPT_BOOLEAN(0, "percore-show-thread", &stat_config.percore_show_thread,
+		    "Use with 'percore' event qualifier to show the event "
+		    "counts of one hardware thread by sum up total hardware "
+		    "threads of same physical core"),
 	OPT_END()
 };
 
diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
index 5e36bc2468d0..c998e4f1d1d2 100644
--- a/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z15/crypto6.json
@@ -4,27 +4,27 @@
 		"EventCode": "80",
 		"EventName": "ECC_FUNCTION_COUNT",
 		"BriefDescription": "ECC Function Count",
-		"PublicDescription": "Long ECC function Count"
+		"PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions issued by the CPU."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "81",
 		"EventName": "ECC_CYCLES_COUNT",
 		"BriefDescription": "ECC Cycles Count",
-		"PublicDescription": "Long ECC Function cycles count"
+		"PublicDescription": "This counter counts the total number of CPU cycles when the ECC coprocessor is busy performing the elliptic-curve cryptography (ECC) functions issued by the CPU."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "82",
 		"EventName": "ECC_BLOCKED_FUNCTION_COUNT",
 		"BriefDescription": "Ecc Blocked Function Count",
-		"PublicDescription": "Long ECC blocked function count"
+		"PublicDescription": "This counter counts the total number of the elliptic-curve cryptography (ECC) functions that are issued by the CPU and are blocked because the ECC coprocessor is busy performing a function issued by another CPU."
 	},
 	{
 		"Unit": "CPU-M-CF",
 		"EventCode": "83",
 		"EventName": "ECC_BLOCKED_CYCLES_COUNT",
 		"BriefDescription": "ECC Blocked Cycles Count",
-		"PublicDescription": "Long ECC blocked cycles count"
+		"PublicDescription": "This counter counts the total number of CPU cycles blocked for the elliptic-curve cryptography (ECC) functions issued by the CPU because the ECC coprocessor is busy performing a function issued by another CPU."
 	},
 ]
diff --git a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json
index 89e070727e1b..2df2e231e9ee 100644
--- a/tools/perf/pmu-events/arch/s390/cf_z15/extended.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z15/extended.json
@@ -25,7 +25,7 @@
 		"EventCode": "131",
 		"EventName": "DTLB2_HPAGE_WRITES",
 		"BriefDescription": "DTLB2 One-Megabyte Page Writes",
-		"PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page or a Last Host Translation was done"
+		"PublicDescription": "A translation entry was written into the Combined Region and Segment Table Entry array in the Level-2 TLB for a one-megabyte page"
 	},
 	{
 		"Unit": "CPU-M-CF",
@@ -358,6 +358,34 @@
 	},
 	{
 		"Unit": "CPU-M-CF",
+		"EventCode": "247",
+		"EventName": "DFLT_ACCESS",
+		"BriefDescription": "Cycles CPU spent obtaining access to Deflate unit",
+		"PublicDescription": "Cycles CPU spent obtaining access to Deflate unit"
+	},
+	{
+		"Unit": "CPU-M-CF",
+		"EventCode": "252",
+		"EventName": "DFLT_CYCLES",
+		"BriefDescription": "Cycles CPU is using Deflate unit",
+		"PublicDescription": "Cycles CPU is using Deflate unit"
+	},
+	{
+		"Unit": "CPU-M-CF",
+		"EventCode": "264",
+		"EventName": "DFLT_CC",
+		"BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed",
+		"PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed"
+	},
+	{
+		"Unit": "CPU-M-CF",
+		"EventCode": "265",
+		"EventName": "DFLT_CCERROR",
+		"BriefDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2",
+		"PublicDescription": "Increments by one for every DEFLATE CONVERSION CALL instruction executed that ended in Condition Codes 0, 1 or 2"
+	},
+	{
+		"Unit": "CPU-M-CF",
 		"EventCode": "448",
 		"EventName": "MT_DIAG_CYCLES_ONE_THR_ACTIVE",
 		"BriefDescription": "Cycle count with one thread active",
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
index f94653229dd4..a728c6e5119b 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@@ -215,7 +215,8 @@
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
         "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )",
         "MetricGroup": "TLB",
-        "MetricName": "Page_Walks_Utilization"
+        "MetricName": "Page_Walks_Utilization",
+        "MetricConstraint": "NO_NMI_WATCHDOG"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
index e7feb60f9fa9..f97e8316ad2f 100644
--- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json
@@ -215,7 +215,8 @@
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
         "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )",
         "MetricGroup": "TLB",
-        "MetricName": "Page_Walks_Utilization"
+        "MetricName": "Page_Walks_Utilization",
+        "MetricConstraint": "NO_NMI_WATCHDOG"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
index 21d7a0c2c2e8..35f5db1786f7 100644
--- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json
@@ -215,7 +215,8 @@
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
         "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING + EPT.WALK_PENDING ) / ( 2 * cycles )",
         "MetricGroup": "TLB",
-        "MetricName": "Page_Walks_Utilization"
+        "MetricName": "Page_Walks_Utilization",
+        "MetricConstraint": "NO_NMI_WATCHDOG"
     },
     {
         "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index 27b4da80f751..3c4236a5bad8 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -323,7 +323,7 @@ static int print_events_table_entry(void *data, char *name, char *event,
 				    char *pmu, char *unit, char *perpkg,
 				    char *metric_expr,
 				    char *metric_name, char *metric_group,
-				    char *deprecated)
+				    char *deprecated, char *metric_constraint)
 {
 	struct perf_entry_data *pd = data;
 	FILE *outfp = pd->outfp;
@@ -357,6 +357,8 @@ static int print_events_table_entry(void *data, char *name, char *event,
 		fprintf(outfp, "\t.metric_group = \"%s\",\n", metric_group);
 	if (deprecated)
 		fprintf(outfp, "\t.deprecated = \"%s\",\n", deprecated);
+	if (metric_constraint)
+		fprintf(outfp, "\t.metric_constraint = \"%s\",\n", metric_constraint);
 	fprintf(outfp, "},\n");
 
 	return 0;
@@ -375,6 +377,7 @@ struct event_struct {
 	char *metric_name;
 	char *metric_group;
 	char *deprecated;
+	char *metric_constraint;
 };
 
 #define ADD_EVENT_FIELD(field) do { if (field) {		\
@@ -422,7 +425,7 @@ static int save_arch_std_events(void *data, char *name, char *event,
 				char *desc, char *long_desc, char *pmu,
 				char *unit, char *perpkg, char *metric_expr,
 				char *metric_name, char *metric_group,
-				char *deprecated)
+				char *deprecated, char *metric_constraint)
 {
 	struct event_struct *es;
 
@@ -486,7 +489,7 @@ try_fixup(const char *fn, char *arch_std, char **event, char **desc,
 	  char **name, char **long_desc, char **pmu, char **filter,
 	  char **perpkg, char **unit, char **metric_expr, char **metric_name,
 	  char **metric_group, unsigned long long eventcode,
-	  char **deprecated)
+	  char **deprecated, char **metric_constraint)
 {
 	/* try to find matching event from arch standard values */
 	struct event_struct *es;
@@ -515,7 +518,7 @@ int json_events(const char *fn,
 		      char *pmu, char *unit, char *perpkg,
 		      char *metric_expr,
 		      char *metric_name, char *metric_group,
-		      char *deprecated),
+		      char *deprecated, char *metric_constraint),
 	  void *data)
 {
 	int err;
@@ -545,6 +548,7 @@ int json_events(const char *fn,
 		char *metric_name = NULL;
 		char *metric_group = NULL;
 		char *deprecated = NULL;
+		char *metric_constraint = NULL;
 		char *arch_std = NULL;
 		unsigned long long eventcode = 0;
 		struct msrmap *msr = NULL;
@@ -629,6 +633,8 @@ int json_events(const char *fn,
 				addfield(map, &metric_name, "", "", val);
 			} else if (json_streq(map, field, "MetricGroup")) {
 				addfield(map, &metric_group, "", "", val);
+			} else if (json_streq(map, field, "MetricConstraint")) {
+				addfield(map, &metric_constraint, "", "", val);
 			} else if (json_streq(map, field, "MetricExpr")) {
 				addfield(map, &metric_expr, "", "", val);
 				for (s = metric_expr; *s; s++)
@@ -670,13 +676,13 @@ int json_events(const char *fn,
 					&long_desc, &pmu, &filter, &perpkg,
 					&unit, &metric_expr, &metric_name,
 					&metric_group, eventcode,
-					&deprecated);
+					&deprecated, &metric_constraint);
 			if (err)
 				goto free_strings;
 		}
 		err = func(data, name, real_event(name, event), desc, long_desc,
 			   pmu, unit, perpkg, metric_expr, metric_name,
-			   metric_group, deprecated);
+			   metric_group, deprecated, metric_constraint);
 free_strings:
 		free(event);
 		free(desc);
@@ -691,6 +697,7 @@ free_strings:
 		free(metric_expr);
 		free(metric_name);
 		free(metric_group);
+		free(metric_constraint);
 		free(arch_std);
 
 		if (err)
diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h
index 5cda49a42143..2afc8304529e 100644
--- a/tools/perf/pmu-events/jevents.h
+++ b/tools/perf/pmu-events/jevents.h
@@ -8,7 +8,7 @@ int json_events(const char *fn,
 				char *pmu,
 				char *unit, char *perpkg, char *metric_expr,
 				char *metric_name, char *metric_group,
-				char *deprecated),
+				char *deprecated, char *metric_constraint),
 		void *data);
 char *get_cpu_str(void);
 
diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h
index caeb577d36c9..53e76d5d5b37 100644
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -18,6 +18,7 @@ struct pmu_event {
 	const char *metric_name;
 	const char *metric_group;
 	const char *deprecated;
+	const char *metric_constraint;
 };
 
 /*
diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl
index 4e7076c20616..d307ce8fd6ed 100644
--- a/tools/perf/scripts/perl/check-perf-trace.pl
+++ b/tools/perf/scripts/perl/check-perf-trace.pl
@@ -28,7 +28,7 @@ sub trace_end
 sub irq::softirq_entry
 {
 	my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	    $common_pid, $common_comm,
+	    $common_pid, $common_comm, $common_callchain,
 	    $vec) = @_;
 
 	print_header($event_name, $common_cpu, $common_secs, $common_nsecs,
@@ -43,7 +43,7 @@ sub irq::softirq_entry
 sub kmem::kmalloc
 {
 	my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	    $common_pid, $common_comm,
+	    $common_pid, $common_comm, $common_callchain,
 	    $call_site, $ptr, $bytes_req, $bytes_alloc,
 	    $gfp_flags) = @_;
 
@@ -92,7 +92,7 @@ sub print_unhandled
 sub trace_unhandled
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm) = @_;
+	$common_pid, $common_comm, $common_callchain) = @_;
 
     $unhandled{$event_name}++;
 }
diff --git a/tools/perf/scripts/perl/failed-syscalls.pl b/tools/perf/scripts/perl/failed-syscalls.pl
index 55e7ae4c5c88..05954a8f363a 100644
--- a/tools/perf/scripts/perl/failed-syscalls.pl
+++ b/tools/perf/scripts/perl/failed-syscalls.pl
@@ -18,7 +18,7 @@ my %failed_syscalls;
 sub raw_syscalls::sys_exit
 {
 	my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	    $common_pid, $common_comm,
+	    $common_pid, $common_comm, $common_callchain,
 	    $id, $ret) = @_;
 
 	if ($ret < 0) {
diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl
index 168fa5e94b44..92a750b8552b 100644
--- a/tools/perf/scripts/perl/rw-by-file.pl
+++ b/tools/perf/scripts/perl/rw-by-file.pl
@@ -28,7 +28,7 @@ my %writes;
 sub syscalls::sys_enter_read
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm, $nr, $fd, $buf, $count) = @_;
+	$common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_;
 
     if ($common_comm eq $for_comm) {
 	$reads{$fd}{bytes_requested} += $count;
@@ -39,7 +39,7 @@ sub syscalls::sys_enter_read
 sub syscalls::sys_enter_write
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm, $nr, $fd, $buf, $count) = @_;
+	$common_pid, $common_comm, $common_callchain, $nr, $fd, $buf, $count) = @_;
 
     if ($common_comm eq $for_comm) {
 	$writes{$fd}{bytes_written} += $count;
@@ -98,7 +98,7 @@ sub print_unhandled
 sub trace_unhandled
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm) = @_;
+	$common_pid, $common_comm, $common_callchain) = @_;
 
     $unhandled{$event_name}++;
 }
diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl
index 495698250b2f..d789fe39caab 100644
--- a/tools/perf/scripts/perl/rw-by-pid.pl
+++ b/tools/perf/scripts/perl/rw-by-pid.pl
@@ -24,7 +24,7 @@ my %writes;
 sub syscalls::sys_exit_read
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $ret) = @_;
 
     if ($ret > 0) {
@@ -40,7 +40,7 @@ sub syscalls::sys_exit_read
 sub syscalls::sys_enter_read
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $fd, $buf, $count) = @_;
 
     $reads{$common_pid}{bytes_requested} += $count;
@@ -51,7 +51,7 @@ sub syscalls::sys_enter_read
 sub syscalls::sys_exit_write
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $ret) = @_;
 
     if ($ret <= 0) {
@@ -62,7 +62,7 @@ sub syscalls::sys_exit_write
 sub syscalls::sys_enter_write
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $fd, $buf, $count) = @_;
 
     $writes{$common_pid}{bytes_written} += $count;
@@ -178,7 +178,7 @@ sub print_unhandled
 sub trace_unhandled
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm) = @_;
+	$common_pid, $common_comm, $common_callchain) = @_;
 
     $unhandled{$event_name}++;
 }
diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl
index 6473442568a2..eba4df67af6b 100644
--- a/tools/perf/scripts/perl/rwtop.pl
+++ b/tools/perf/scripts/perl/rwtop.pl
@@ -35,7 +35,7 @@ if (!$interval) {
 sub syscalls::sys_exit_read
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $ret) = @_;
 
     print_check();
@@ -53,7 +53,7 @@ sub syscalls::sys_exit_read
 sub syscalls::sys_enter_read
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $fd, $buf, $count) = @_;
 
     print_check();
@@ -66,7 +66,7 @@ sub syscalls::sys_enter_read
 sub syscalls::sys_exit_write
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $ret) = @_;
 
     print_check();
@@ -79,7 +79,7 @@ sub syscalls::sys_exit_write
 sub syscalls::sys_enter_write
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$nr, $fd, $buf, $count) = @_;
 
     print_check();
@@ -197,7 +197,7 @@ sub print_unhandled
 sub trace_unhandled
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm) = @_;
+	$common_pid, $common_comm, $common_callchain) = @_;
 
     $unhandled{$event_name}++;
 }
diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl
index efcfec5e347a..53444ff4ec7f 100644
--- a/tools/perf/scripts/perl/wakeup-latency.pl
+++ b/tools/perf/scripts/perl/wakeup-latency.pl
@@ -28,7 +28,7 @@ my $total_wakeups = 0;
 sub sched::sched_switch
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$prev_comm, $prev_pid, $prev_prio, $prev_state, $next_comm, $next_pid,
 	$next_prio) = @_;
 
@@ -51,7 +51,7 @@ sub sched::sched_switch
 sub sched::sched_wakeup
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm,
+	$common_pid, $common_comm, $common_callchain,
 	$comm, $pid, $prio, $success, $target_cpu) = @_;
 
     $last_wakeup{$target_cpu}{ts} = nsecs($common_secs, $common_nsecs);
@@ -101,7 +101,7 @@ sub print_unhandled
 sub trace_unhandled
 {
     my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
-	$common_pid, $common_comm) = @_;
+	$common_pid, $common_comm, $common_callchain) = @_;
 
     $unhandled{$event_name}++;
 }
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 5f05db75cdd8..54d9516c9839 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -543,8 +543,11 @@ static int run_shell_tests(int argc, const char *argv[], int i, int width)
 		return -1;
 
 	dir = opendir(st.dir);
-	if (!dir)
+	if (!dir) {
+		pr_err("failed to open shell test directory: %s\n",
+			st.dir);
 		return -1;
+	}
 
 	for_each_shell_test(dir, st.dir, ent) {
 		int curr = i++;
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
index 87843af4c118..28313e59d6f6 100644
--- a/tools/perf/tests/expr.c
+++ b/tools/perf/tests/expr.c
@@ -10,7 +10,7 @@ static int test(struct parse_ctx *ctx, const char *e, double val2)
 {
 	double val;
 
-	if (expr__parse(&val, ctx, &e))
+	if (expr__parse(&val, ctx, e))
 		TEST_ASSERT_VAL("parse test failed", 0);
 	TEST_ASSERT_VAL("unexpected value", val == val2);
 	return 0;
@@ -44,12 +44,12 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused)
 		return ret;
 
 	p = "FOO/0";
-	ret = expr__parse(&val, &ctx, &p);
-	TEST_ASSERT_VAL("division by zero", ret == 1);
+	ret = expr__parse(&val, &ctx, p);
+	TEST_ASSERT_VAL("division by zero", ret == -1);
 
 	p = "BAR/";
-	ret = expr__parse(&val, &ctx, &p);
-	TEST_ASSERT_VAL("missing operand", ret == 1);
+	ret = expr__parse(&val, &ctx, p);
+	TEST_ASSERT_VAL("missing operand", ret == -1);
 
 	TEST_ASSERT_VAL("find other",
 			expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other) == 0);
diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 2762e1155238..14239e472187 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -99,6 +99,7 @@ static bool samples_same(const struct perf_sample *s1,
 
 	if (type & PERF_SAMPLE_BRANCH_STACK) {
 		COMP(branch_stack->nr);
+		COMP(branch_stack->hw_idx);
 		for (i = 0; i < s1->branch_stack->nr; i++)
 			MCOMP(branch_stack->entries[i]);
 	}
@@ -186,7 +187,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
 		u64 data[64];
 	} branch_stack = {
 		/* 1 branch_entry */
-		.data = {1, 211, 212, 213},
+		.data = {1, -1ULL, 211, 212, 213},
 	};
 	u64 regs[64];
 	const u64 raw_data[] = {0x123456780a0b0c0dULL, 0x1102030405060708ULL};
@@ -208,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
 		.transaction	= 112,
 		.raw_data	= (void *)raw_data,
 		.callchain	= &callchain.callchain,
+		.no_hw_idx      = false,
 		.branch_stack	= &branch_stack.branch_stack,
 		.user_regs	= {
 			.abi	= PERF_SAMPLE_REGS_ABI_64,
@@ -244,6 +246,9 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
 	if (sample_type & PERF_SAMPLE_REGS_INTR)
 		evsel.core.attr.sample_regs_intr = sample_regs;
 
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK)
+		evsel.core.attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
+
 	for (i = 0; i < sizeof(regs); i++)
 		*(i + (u8 *)regs) = i & 0xfe;
 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 07da6c790b63..c0cf8dff694e 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -121,7 +121,9 @@ perf-y += mem-events.o
 perf-y += vsprintf.o
 perf-y += units.o
 perf-y += time-utils.o
+perf-y += expr-flex.o
 perf-y += expr-bison.o
+perf-y += expr.o
 perf-y += branch.o
 perf-y += mem2node.o
 
@@ -189,9 +191,13 @@ $(OUTPUT)util/parse-events-bison.c: util/parse-events.y
 	$(call rule_mkdir)
 	$(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
 
+$(OUTPUT)util/expr-flex.c: util/expr.l $(OUTPUT)util/expr-bison.c
+	$(call rule_mkdir)
+	$(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/expr-flex.h $(PARSER_DEBUG_FLEX) util/expr.l
+
 $(OUTPUT)util/expr-bison.c: util/expr.y
 	$(call rule_mkdir)
-	$(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr__
+	$(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr_
 
 $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
 	$(call rule_mkdir)
@@ -203,12 +209,14 @@ $(OUTPUT)util/pmu-bison.c: util/pmu.y
 
 CFLAGS_parse-events-flex.o  += -w
 CFLAGS_pmu-flex.o           += -w
+CFLAGS_expr-flex.o          += -w
 CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
 CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
 CFLAGS_expr-bison.o         += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
 
 $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
 $(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
+$(OUTPUT)util/expr.o: $(OUTPUT)util/expr-flex.c $(OUTPUT)util/expr-bison.c
 
 CFLAGS_bitmap.o        += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
 CFLAGS_find_bit.o      += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
@@ -216,6 +224,7 @@ CFLAGS_rbtree.o        += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ET
 CFLAGS_libstring.o     += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
 CFLAGS_hweight.o       += -Wno-unused-parameter -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))"
 CFLAGS_parse-events.o  += -Wno-redundant-decls
+CFLAGS_expr.o          += -Wno-redundant-decls
 CFLAGS_header.o        += -include $(OUTPUT)PERF-VERSION-FILE
 
 $(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c FORCE
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 0ea95be84b3b..f1ea0d61eb5b 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2611,8 +2611,6 @@ void annotation__mark_jump_targets(struct annotation *notes, struct symbol *sym)
 
 		if (++al->jump_sources > notes->max_jump_sources)
 			notes->max_jump_sources = al->jump_sources;
-
-		++notes->nr_jumps;
 	}
 }
 
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 001258601a37..07c775938d46 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -279,7 +279,6 @@ struct annotation {
 	struct annotation_options *options;
 	struct annotation_line	**offsets;
 	int			nr_events;
-	int			nr_jumps;
 	int			max_jump_sources;
 	int			nr_entries;
 	int			nr_asm_entries;
diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c
index fbbb6d640dad..423ec69bda6c 100644
--- a/tools/perf/util/block-info.c
+++ b/tools/perf/util/block-info.c
@@ -65,8 +65,7 @@ struct block_info *block_info__new(void)
 	return bi;
 }
 
-int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused,
-			struct hist_entry *left, struct hist_entry *right)
+int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	struct block_info *bi_l = left->block_info;
 	struct block_info *bi_r = right->block_info;
@@ -74,30 +73,27 @@ int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused,
 
 	if (!bi_l->sym || !bi_r->sym) {
 		if (!bi_l->sym && !bi_r->sym)
-			return 0;
+			return -1;
 		else if (!bi_l->sym)
 			return -1;
 		else
 			return 1;
 	}
 
-	if (bi_l->sym == bi_r->sym) {
-		if (bi_l->start == bi_r->start) {
-			if (bi_l->end == bi_r->end)
-				return 0;
-			else
-				return (int64_t)(bi_r->end - bi_l->end);
-		} else
-			return (int64_t)(bi_r->start - bi_l->start);
-	} else {
-		cmp = strcmp(bi_l->sym->name, bi_r->sym->name);
+	cmp = strcmp(bi_l->sym->name, bi_r->sym->name);
+	if (cmp)
 		return cmp;
-	}
 
-	if (bi_l->sym->start != bi_r->sym->start)
-		return (int64_t)(bi_r->sym->start - bi_l->sym->start);
+	if (bi_l->start != bi_r->start)
+		return (int64_t)(bi_r->start - bi_l->start);
 
-	return (int64_t)(bi_r->sym->end - bi_l->sym->end);
+	return (int64_t)(bi_r->end - bi_l->end);
+}
+
+int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused,
+			struct hist_entry *left, struct hist_entry *right)
+{
+	return __block_info__cmp(left, right);
 }
 
 static void init_block_info(struct block_info *bi, struct symbol *sym,
@@ -185,6 +181,17 @@ static int block_column_width(struct perf_hpp_fmt *fmt,
 	return block_fmt->width;
 }
 
+static int color_pct(struct perf_hpp *hpp, int width, double pct)
+{
+#ifdef HAVE_SLANG_SUPPORT
+	if (use_browser) {
+		return __hpp__slsmg_color_printf(hpp, "%*.2f%%",
+						 width - 1, pct);
+	}
+#endif
+	return hpp_color_scnprintf(hpp, "%*.2f%%", width - 1, pct);
+}
+
 static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt,
 					struct perf_hpp *hpp,
 					struct hist_entry *he)
@@ -192,14 +199,11 @@ static int block_total_cycles_pct_entry(struct perf_hpp_fmt *fmt,
 	struct block_fmt *block_fmt = container_of(fmt, struct block_fmt, fmt);
 	struct block_info *bi = he->block_info;
 	double ratio = 0.0;
-	char buf[16];
 
 	if (block_fmt->total_cycles)
 		ratio = (double)bi->cycles / (double)block_fmt->total_cycles;
 
-	sprintf(buf, "%.2f%%", 100.0 * ratio);
-
-	return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf);
+	return color_pct(hpp, block_fmt->width, 100.0 * ratio);
 }
 
 static int64_t block_total_cycles_pct_sort(struct perf_hpp_fmt *fmt,
@@ -252,16 +256,13 @@ static int block_cycles_pct_entry(struct perf_hpp_fmt *fmt,
 	struct block_info *bi = he->block_info;
 	double ratio = 0.0;
 	u64 avg;
-	char buf[16];
 
 	if (block_fmt->block_cycles && bi->num_aggr) {
 		avg = bi->cycles_aggr / bi->num_aggr;
 		ratio = (double)avg / (double)block_fmt->block_cycles;
 	}
 
-	sprintf(buf, "%.2f%%", 100.0 * ratio);
-
-	return scnprintf(hpp->buf, hpp->size, "%*s", block_fmt->width, buf);
+	return color_pct(hpp, block_fmt->width, 100.0 * ratio);
 }
 
 static int block_avg_cycles_entry(struct perf_hpp_fmt *fmt,
@@ -349,7 +350,7 @@ static void hpp_register(struct block_fmt *block_fmt, int idx,
 
 	switch (idx) {
 	case PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT:
-		fmt->entry = block_total_cycles_pct_entry;
+		fmt->color = block_total_cycles_pct_entry;
 		fmt->cmp = block_info__cmp;
 		fmt->sort = block_total_cycles_pct_sort;
 		break;
@@ -357,7 +358,7 @@ static void hpp_register(struct block_fmt *block_fmt, int idx,
 		fmt->entry = block_cycles_lbr_entry;
 		break;
 	case PERF_HPP_REPORT__BLOCK_CYCLES_PCT:
-		fmt->entry = block_cycles_pct_entry;
+		fmt->color = block_cycles_pct_entry;
 		break;
 	case PERF_HPP_REPORT__BLOCK_AVG_CYCLES:
 		fmt->entry = block_avg_cycles_entry;
@@ -377,33 +378,41 @@ static void hpp_register(struct block_fmt *block_fmt, int idx,
 }
 
 static void register_block_columns(struct perf_hpp_list *hpp_list,
-				   struct block_fmt *block_fmts)
+				   struct block_fmt *block_fmts,
+				   int *block_hpps, int nr_hpps)
 {
-	for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++)
-		hpp_register(&block_fmts[i], i, hpp_list);
+	for (int i = 0; i < nr_hpps; i++)
+		hpp_register(&block_fmts[i], block_hpps[i], hpp_list);
 }
 
-static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts)
+static void init_block_hist(struct block_hist *bh, struct block_fmt *block_fmts,
+			    int *block_hpps, int nr_hpps)
 {
 	__hists__init(&bh->block_hists, &bh->block_list);
 	perf_hpp_list__init(&bh->block_list);
 	bh->block_list.nr_header_lines = 1;
 
-	register_block_columns(&bh->block_list, block_fmts);
+	register_block_columns(&bh->block_list, block_fmts,
+			       block_hpps, nr_hpps);
 
-	perf_hpp_list__register_sort_field(&bh->block_list,
-		&block_fmts[PERF_HPP_REPORT__BLOCK_TOTAL_CYCLES_PCT].fmt);
+	/* Sort by the first fmt */
+	perf_hpp_list__register_sort_field(&bh->block_list, &block_fmts[0].fmt);
 }
 
-static void process_block_report(struct hists *hists,
-				 struct block_report *block_report,
-				 u64 total_cycles)
+static int process_block_report(struct hists *hists,
+				struct block_report *block_report,
+				u64 total_cycles, int *block_hpps,
+				int nr_hpps)
 {
 	struct rb_node *next = rb_first_cached(&hists->entries);
 	struct block_hist *bh = &block_report->hist;
 	struct hist_entry *he;
 
-	init_block_hist(bh, block_report->fmts);
+	if (nr_hpps > PERF_HPP_REPORT__BLOCK_MAX_INDEX)
+		return -1;
+
+	block_report->nr_fmts = nr_hpps;
+	init_block_hist(bh, block_report->fmts, block_hpps, nr_hpps);
 
 	while (next) {
 		he = rb_entry(next, struct hist_entry, rb_node);
@@ -412,16 +421,19 @@ static void process_block_report(struct hists *hists,
 		next = rb_next(&he->rb_node);
 	}
 
-	for (int i = 0; i < PERF_HPP_REPORT__BLOCK_MAX_INDEX; i++) {
+	for (int i = 0; i < nr_hpps; i++) {
 		block_report->fmts[i].total_cycles = total_cycles;
 		block_report->fmts[i].block_cycles = block_report->cycles;
 	}
 
 	hists__output_resort(&bh->block_hists, NULL);
+	return 0;
 }
 
 struct block_report *block_info__create_report(struct evlist *evlist,
-					       u64 total_cycles)
+					       u64 total_cycles,
+					       int *block_hpps, int nr_hpps,
+					       int *nr_reps)
 {
 	struct block_report *block_reports;
 	int nr_hists = evlist->core.nr_entries, i = 0;
@@ -434,13 +446,23 @@ struct block_report *block_info__create_report(struct evlist *evlist,
 	evlist__for_each_entry(evlist, pos) {
 		struct hists *hists = evsel__hists(pos);
 
-		process_block_report(hists, &block_reports[i], total_cycles);
+		process_block_report(hists, &block_reports[i], total_cycles,
+				     block_hpps, nr_hpps);
 		i++;
 	}
 
+	*nr_reps = nr_hists;
 	return block_reports;
 }
 
+void block_info__free_report(struct block_report *reps, int nr_reps)
+{
+	for (int i = 0; i < nr_reps; i++)
+		hists__delete_entries(&reps[i].hist.block_hists);
+
+	free(reps);
+}
+
 int report__browse_block_hists(struct block_hist *bh, float min_percent,
 			       struct evsel *evsel, struct perf_env *env,
 			       struct annotation_options *annotation_opts)
@@ -452,13 +474,11 @@ int report__browse_block_hists(struct block_hist *bh, float min_percent,
 		symbol_conf.report_individual_block = true;
 		hists__fprintf(&bh->block_hists, true, 0, 0, min_percent,
 			       stdout, true);
-		hists__delete_entries(&bh->block_hists);
 		return 0;
 	case 1:
 		symbol_conf.report_individual_block = true;
 		ret = block_hists_tui_browse(bh, evsel, min_percent,
 					     env, annotation_opts);
-		hists__delete_entries(&bh->block_hists);
 		return ret;
 	default:
 		return -1;
diff --git a/tools/perf/util/block-info.h b/tools/perf/util/block-info.h
index bef0d75e9819..42e9dcc4cf0a 100644
--- a/tools/perf/util/block-info.h
+++ b/tools/perf/util/block-info.h
@@ -45,6 +45,7 @@ struct block_report {
 	struct block_hist	hist;
 	u64			cycles;
 	struct block_fmt	fmts[PERF_HPP_REPORT__BLOCK_MAX_INDEX];
+	int			nr_fmts;
 };
 
 struct block_hist;
@@ -61,6 +62,8 @@ static inline void __block_info__zput(struct block_info **bi)
 
 #define block_info__zput(bi) __block_info__zput(&bi)
 
+int64_t __block_info__cmp(struct hist_entry *left, struct hist_entry *right);
+
 int64_t block_info__cmp(struct perf_hpp_fmt *fmt __maybe_unused,
 			struct hist_entry *left, struct hist_entry *right);
 
@@ -68,7 +71,11 @@ int block_info__process_sym(struct hist_entry *he, struct block_hist *bh,
 			    u64 *block_cycles_aggr, u64 total_cycles);
 
 struct block_report *block_info__create_report(struct evlist *evlist,
-					       u64 total_cycles);
+					       u64 total_cycles,
+					       int *block_hpps, int nr_hpps,
+					       int *nr_reps);
+
+void block_info__free_report(struct block_report *reps, int nr_reps);
 
 int report__browse_block_hists(struct block_hist *bh, float min_percent,
 			       struct evsel *evsel, struct perf_env *env,
diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h
index 88e00d268f6f..154a05cd03af 100644
--- a/tools/perf/util/branch.h
+++ b/tools/perf/util/branch.h
@@ -12,6 +12,7 @@
 #include <linux/stddef.h>
 #include <linux/perf_event.h>
 #include <linux/types.h>
+#include "event.h"
 
 struct branch_flags {
 	u64 mispred:1;
@@ -39,9 +40,30 @@ struct branch_entry {
 
 struct branch_stack {
 	u64			nr;
+	u64			hw_idx;
 	struct branch_entry	entries[0];
 };
 
+/*
+ * The hw_idx is only available when PERF_SAMPLE_BRANCH_HW_INDEX is applied.
+ * Otherwise, the output format of a sample with branch stack is
+ * struct branch_stack {
+ *	u64			nr;
+ *	struct branch_entry	entries[0];
+ * }
+ * Check whether the hw_idx is available,
+ * and return the corresponding pointer of entries[0].
+ */
+static inline struct branch_entry *perf_sample__branch_entries(struct perf_sample *sample)
+{
+	u64 *entry = (u64 *)sample->branch_stack;
+
+	entry++;
+	if (sample->no_hw_idx)
+		return (struct branch_entry *)entry;
+	return (struct branch_entry *)(++entry);
+}
+
 struct branch_type_stat {
 	bool	branch_to;
 	u64	counts[PERF_BR_MAX];
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 4881d4af3381..5bc9d3b01bd9 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -3,75 +3,16 @@
 #include "evsel.h"
 #include "cgroup.h"
 #include "evlist.h"
-#include <linux/stringify.h>
 #include <linux/zalloc.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <string.h>
+#include <api/fs/fs.h>
 
 int nr_cgroups;
 
-static int
-cgroupfs_find_mountpoint(char *buf, size_t maxlen)
-{
-	FILE *fp;
-	char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
-	char path_v1[PATH_MAX + 1], path_v2[PATH_MAX + 2], *path;
-	char *token, *saved_ptr = NULL;
-
-	fp = fopen("/proc/mounts", "r");
-	if (!fp)
-		return -1;
-
-	/*
-	 * in order to handle split hierarchy, we need to scan /proc/mounts
-	 * and inspect every cgroupfs mount point to find one that has
-	 * perf_event subsystem
-	 */
-	path_v1[0] = '\0';
-	path_v2[0] = '\0';
-
-	while (fscanf(fp, "%*s %"__stringify(PATH_MAX)"s %"__stringify(PATH_MAX)"s %"
-				__stringify(PATH_MAX)"s %*d %*d\n",
-				mountpoint, type, tokens) == 3) {
-
-		if (!path_v1[0] && !strcmp(type, "cgroup")) {
-
-			token = strtok_r(tokens, ",", &saved_ptr);
-
-			while (token != NULL) {
-				if (!strcmp(token, "perf_event")) {
-					strcpy(path_v1, mountpoint);
-					break;
-				}
-				token = strtok_r(NULL, ",", &saved_ptr);
-			}
-		}
-
-		if (!path_v2[0] && !strcmp(type, "cgroup2"))
-			strcpy(path_v2, mountpoint);
-
-		if (path_v1[0] && path_v2[0])
-			break;
-	}
-	fclose(fp);
-
-	if (path_v1[0])
-		path = path_v1;
-	else if (path_v2[0])
-		path = path_v2;
-	else
-		return -1;
-
-	if (strlen(path) < maxlen) {
-		strcpy(buf, path);
-		return 0;
-	}
-	return -1;
-}
-
 static int open_cgroup(const char *name)
 {
 	char path[PATH_MAX + 1];
@@ -79,7 +20,7 @@ static int open_cgroup(const char *name)
 	int fd;
 
 
-	if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1))
+	if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1, "perf_event"))
 		return -1;
 
 	scnprintf(path, PATH_MAX, "%s/%s", mnt, name);
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index 5471045ebf5c..62d2f9b9ce1b 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -363,6 +363,23 @@ struct cs_etm_packet_queue
 	return NULL;
 }
 
+static void cs_etm__packet_swap(struct cs_etm_auxtrace *etm,
+				struct cs_etm_traceid_queue *tidq)
+{
+	struct cs_etm_packet *tmp;
+
+	if (etm->sample_branches || etm->synth_opts.last_branch ||
+	    etm->sample_instructions) {
+		/*
+		 * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for
+		 * the next incoming packet.
+		 */
+		tmp = tidq->packet;
+		tidq->packet = tidq->prev_packet;
+		tidq->prev_packet = tmp;
+	}
+}
+
 static void cs_etm__packet_dump(const char *pkt_string)
 {
 	const char *color = PERF_COLOR_BLUE;
@@ -945,7 +962,7 @@ static inline u64 cs_etm__instr_addr(struct cs_etm_queue *etmq,
 	if (packet->isa == CS_ETM_ISA_T32) {
 		u64 addr = packet->start_addr;
 
-		while (offset > 0) {
+		while (offset) {
 			addr += cs_etm__t32_instr_size(etmq,
 						       trace_chan_id, addr);
 			offset--;
@@ -1134,10 +1151,8 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 
 	cs_etm__copy_insn(etmq, tidq->trace_chan_id, tidq->packet, &sample);
 
-	if (etm->synth_opts.last_branch) {
-		cs_etm__copy_last_branch_rb(etmq, tidq);
+	if (etm->synth_opts.last_branch)
 		sample.branch_stack = tidq->last_branch;
-	}
 
 	if (etm->synth_opts.inject) {
 		ret = cs_etm__inject_event(event, &sample,
@@ -1153,9 +1168,6 @@ static int cs_etm__synth_instruction_sample(struct cs_etm_queue *etmq,
 			"CS ETM Trace: failed to deliver instruction event, error %d\n",
 			ret);
 
-	if (etm->synth_opts.last_branch)
-		cs_etm__reset_last_branch_rb(tidq);
-
 	return ret;
 }
 
@@ -1172,6 +1184,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
 	union perf_event *event = tidq->event_buf;
 	struct dummy_branch_stack {
 		u64			nr;
+		u64			hw_idx;
 		struct branch_entry	entries;
 	} dummy_bs;
 	u64 ip;
@@ -1202,6 +1215,7 @@ static int cs_etm__synth_branch_sample(struct cs_etm_queue *etmq,
 	if (etm->synth_opts.last_branch) {
 		dummy_bs = (struct dummy_branch_stack){
 			.nr = 1,
+			.hw_idx = -1ULL,
 			.entries = {
 				.from = sample.ip,
 				.to = sample.addr,
@@ -1340,12 +1354,14 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
 			  struct cs_etm_traceid_queue *tidq)
 {
 	struct cs_etm_auxtrace *etm = etmq->etm;
-	struct cs_etm_packet *tmp;
 	int ret;
 	u8 trace_chan_id = tidq->trace_chan_id;
-	u64 instrs_executed = tidq->packet->instr_count;
+	u64 instrs_prev;
+
+	/* Get instructions remainder from previous packet */
+	instrs_prev = tidq->period_instructions;
 
-	tidq->period_instructions += instrs_executed;
+	tidq->period_instructions += tidq->packet->instr_count;
 
 	/*
 	 * Record a branch when the last instruction in
@@ -1363,26 +1379,80 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
 		 * TODO: allow period to be defined in cycles and clock time
 		 */
 
-		/* Get number of instructions executed after the sample point */
-		u64 instrs_over = tidq->period_instructions -
-			etm->instructions_sample_period;
+		/*
+		 * Below diagram demonstrates the instruction samples
+		 * generation flows:
+		 *
+		 *    Instrs     Instrs       Instrs       Instrs
+		 *   Sample(n)  Sample(n+1)  Sample(n+2)  Sample(n+3)
+		 *    |            |            |            |
+		 *    V            V            V            V
+		 *   --------------------------------------------------
+		 *            ^                                  ^
+		 *            |                                  |
+		 *         Period                             Period
+		 *    instructions(Pi)                   instructions(Pi')
+		 *
+		 *            |                                  |
+		 *            \---------------- -----------------/
+		 *                             V
+		 *                 tidq->packet->instr_count
+		 *
+		 * Instrs Sample(n...) are the synthesised samples occurring
+		 * every etm->instructions_sample_period instructions - as
+		 * defined on the perf command line.  Sample(n) is being the
+		 * last sample before the current etm packet, n+1 to n+3
+		 * samples are generated from the current etm packet.
+		 *
+		 * tidq->packet->instr_count represents the number of
+		 * instructions in the current etm packet.
+		 *
+		 * Period instructions (Pi) contains the the number of
+		 * instructions executed after the sample point(n) from the
+		 * previous etm packet.  This will always be less than
+		 * etm->instructions_sample_period.
+		 *
+		 * When generate new samples, it combines with two parts
+		 * instructions, one is the tail of the old packet and another
+		 * is the head of the new coming packet, to generate
+		 * sample(n+1); sample(n+2) and sample(n+3) consume the
+		 * instructions with sample period.  After sample(n+3), the rest
+		 * instructions will be used by later packet and it is assigned
+		 * to tidq->period_instructions for next round calculation.
+		 */
 
 		/*
-		 * Calculate the address of the sampled instruction (-1 as
-		 * sample is reported as though instruction has just been
-		 * executed, but PC has not advanced to next instruction)
+		 * Get the initial offset into the current packet instructions;
+		 * entry conditions ensure that instrs_prev is less than
+		 * etm->instructions_sample_period.
 		 */
-		u64 offset = (instrs_executed - instrs_over - 1);
-		u64 addr = cs_etm__instr_addr(etmq, trace_chan_id,
-					      tidq->packet, offset);
+		u64 offset = etm->instructions_sample_period - instrs_prev;
+		u64 addr;
 
-		ret = cs_etm__synth_instruction_sample(
-			etmq, tidq, addr, etm->instructions_sample_period);
-		if (ret)
-			return ret;
+		/* Prepare last branches for instruction sample */
+		if (etm->synth_opts.last_branch)
+			cs_etm__copy_last_branch_rb(etmq, tidq);
 
-		/* Carry remaining instructions into next sample period */
-		tidq->period_instructions = instrs_over;
+		while (tidq->period_instructions >=
+				etm->instructions_sample_period) {
+			/*
+			 * Calculate the address of the sampled instruction (-1
+			 * as sample is reported as though instruction has just
+			 * been executed, but PC has not advanced to next
+			 * instruction)
+			 */
+			addr = cs_etm__instr_addr(etmq, trace_chan_id,
+						  tidq->packet, offset - 1);
+			ret = cs_etm__synth_instruction_sample(
+				etmq, tidq, addr,
+				etm->instructions_sample_period);
+			if (ret)
+				return ret;
+
+			offset += etm->instructions_sample_period;
+			tidq->period_instructions -=
+				etm->instructions_sample_period;
+		}
 	}
 
 	if (etm->sample_branches) {
@@ -1404,15 +1474,7 @@ static int cs_etm__sample(struct cs_etm_queue *etmq,
 		}
 	}
 
-	if (etm->sample_branches || etm->synth_opts.last_branch) {
-		/*
-		 * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for
-		 * the next incoming packet.
-		 */
-		tmp = tidq->packet;
-		tidq->packet = tidq->prev_packet;
-		tidq->prev_packet = tmp;
-	}
+	cs_etm__packet_swap(etm, tidq);
 
 	return 0;
 }
@@ -1441,7 +1503,6 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
 {
 	int err = 0;
 	struct cs_etm_auxtrace *etm = etmq->etm;
-	struct cs_etm_packet *tmp;
 
 	/* Handle start tracing packet */
 	if (tidq->prev_packet->sample_type == CS_ETM_EMPTY)
@@ -1449,6 +1510,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
 
 	if (etmq->etm->synth_opts.last_branch &&
 	    tidq->prev_packet->sample_type == CS_ETM_RANGE) {
+		u64 addr;
+
+		/* Prepare last branches for instruction sample */
+		cs_etm__copy_last_branch_rb(etmq, tidq);
+
 		/*
 		 * Generate a last branch event for the branches left in the
 		 * circular buffer at the end of the trace.
@@ -1456,7 +1522,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
 		 * Use the address of the end of the last reported execution
 		 * range
 		 */
-		u64 addr = cs_etm__last_executed_instr(tidq->prev_packet);
+		addr = cs_etm__last_executed_instr(tidq->prev_packet);
 
 		err = cs_etm__synth_instruction_sample(
 			etmq, tidq, addr,
@@ -1476,15 +1542,11 @@ static int cs_etm__flush(struct cs_etm_queue *etmq,
 	}
 
 swap_packet:
-	if (etm->sample_branches || etm->synth_opts.last_branch) {
-		/*
-		 * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for
-		 * the next incoming packet.
-		 */
-		tmp = tidq->packet;
-		tidq->packet = tidq->prev_packet;
-		tidq->prev_packet = tmp;
-	}
+	cs_etm__packet_swap(etm, tidq);
+
+	/* Reset last branches after flush the trace */
+	if (etm->synth_opts.last_branch)
+		cs_etm__reset_last_branch_rb(tidq);
 
 	return err;
 }
@@ -1505,11 +1567,16 @@ static int cs_etm__end_block(struct cs_etm_queue *etmq,
 	 */
 	if (etmq->etm->synth_opts.last_branch &&
 	    tidq->prev_packet->sample_type == CS_ETM_RANGE) {
+		u64 addr;
+
+		/* Prepare last branches for instruction sample */
+		cs_etm__copy_last_branch_rb(etmq, tidq);
+
 		/*
 		 * Use the address of the end of the last reported execution
 		 * range.
 		 */
-		u64 addr = cs_etm__last_executed_instr(tidq->prev_packet);
+		addr = cs_etm__last_executed_instr(tidq->prev_packet);
 
 		err = cs_etm__synth_instruction_sample(
 			etmq, tidq, addr,
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 85223159737c..3cda40a2fafc 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -139,6 +139,7 @@ struct perf_sample {
 	u16 insn_len;
 	u8  cpumode;
 	u16 misc;
+	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
 	char insn[MAX_INSN];
 	void *raw_data;
 	struct ip_callchain *callchain;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index c8dc4450884c..816d930d774e 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -712,7 +712,8 @@ static void __perf_evsel__config_callchain(struct evsel *evsel,
 				attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
 							PERF_SAMPLE_BRANCH_CALL_STACK |
 							PERF_SAMPLE_BRANCH_NO_CYCLES |
-							PERF_SAMPLE_BRANCH_NO_FLAGS;
+							PERF_SAMPLE_BRANCH_NO_FLAGS |
+							PERF_SAMPLE_BRANCH_HW_INDEX;
 			}
 		} else
 			 pr_warning("Cannot use LBR callstack with branch stack. "
@@ -763,7 +764,8 @@ perf_evsel__reset_callgraph(struct evsel *evsel,
 	if (param->record_mode == CALLCHAIN_LBR) {
 		perf_evsel__reset_sample_bit(evsel, BRANCH_STACK);
 		attr->branch_sample_type &= ~(PERF_SAMPLE_BRANCH_USER |
-					      PERF_SAMPLE_BRANCH_CALL_STACK);
+					      PERF_SAMPLE_BRANCH_CALL_STACK |
+					      PERF_SAMPLE_BRANCH_HW_INDEX);
 	}
 	if (param->record_mode == CALLCHAIN_DWARF) {
 		perf_evsel__reset_sample_bit(evsel, REGS_USER);
@@ -1673,6 +1675,8 @@ fallback_missing_features:
 		evsel->core.attr.ksymbol = 0;
 	if (perf_missing_features.bpf)
 		evsel->core.attr.bpf_event = 0;
+	if (perf_missing_features.branch_hw_idx)
+		evsel->core.attr.branch_sample_type &= ~PERF_SAMPLE_BRANCH_HW_INDEX;
 retry_sample_id:
 	if (perf_missing_features.sample_id_all)
 		evsel->core.attr.sample_id_all = 0;
@@ -1784,7 +1788,12 @@ try_fallback:
 	 * Must probe features in the order they were added to the
 	 * perf_event_attr interface.
 	 */
-	if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) {
+	if (!perf_missing_features.branch_hw_idx &&
+	    (evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX)) {
+		perf_missing_features.branch_hw_idx = true;
+		pr_debug2("switching off branch HW index support\n");
+		goto fallback_missing_features;
+	} else if (!perf_missing_features.aux_output && evsel->core.attr.aux_output) {
 		perf_missing_features.aux_output = true;
 		pr_debug2_peo("Kernel has no attr.aux_output support, bailing out\n");
 		goto out_close;
@@ -2169,7 +2178,12 @@ int perf_evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 
 		if (data->branch_stack->nr > max_branch_nr)
 			return -EFAULT;
+
 		sz = data->branch_stack->nr * sizeof(struct branch_entry);
+		if (perf_evsel__has_branch_hw_idx(evsel))
+			sz += sizeof(u64);
+		else
+			data->no_hw_idx = true;
 		OVERFLOW_CHECK(array, sz, max_size);
 		array = (void *)array + sz;
 	}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index dc14f4a823cd..33804740e2ca 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -119,6 +119,7 @@ struct perf_missing_features {
 	bool ksymbol;
 	bool bpf;
 	bool aux_output;
+	bool branch_hw_idx;
 };
 
 extern struct perf_missing_features perf_missing_features;
@@ -389,6 +390,11 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel)
 	return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK;
 }
 
+static inline bool perf_evsel__has_branch_hw_idx(const struct evsel *evsel)
+{
+	return evsel->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
 static inline bool evsel__has_callchain(const struct evsel *evsel)
 {
 	return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0;
diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
new file mode 100644
index 000000000000..fd192ddf93c1
--- /dev/null
+++ b/tools/perf/util/expr.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdbool.h>
+#include <assert.h>
+#include "expr.h"
+#include "expr-bison.h"
+#define YY_EXTRA_TYPE int
+#include "expr-flex.h"
+
+#ifdef PARSER_DEBUG
+extern int expr_debug;
+#endif
+
+/* Caller must make sure id is allocated */
+void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
+{
+	int idx;
+
+	assert(ctx->num_ids < MAX_PARSE_ID);
+	idx = ctx->num_ids++;
+	ctx->ids[idx].name = name;
+	ctx->ids[idx].val = val;
+}
+
+void expr__ctx_init(struct parse_ctx *ctx)
+{
+	ctx->num_ids = 0;
+}
+
+static int
+__expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
+	      int start)
+{
+	YY_BUFFER_STATE buffer;
+	void *scanner;
+	int ret;
+
+	ret = expr_lex_init_extra(start, &scanner);
+	if (ret)
+		return ret;
+
+	buffer = expr__scan_string(expr, scanner);
+
+#ifdef PARSER_DEBUG
+	expr_debug = 1;
+#endif
+
+	ret = expr_parse(val, ctx, scanner);
+
+	expr__flush_buffer(buffer, scanner);
+	expr__delete_buffer(buffer, scanner);
+	expr_lex_destroy(scanner);
+	return ret;
+}
+
+int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr)
+{
+	return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? -1 : 0;
+}
+
+static bool
+already_seen(const char *val, const char *one, const char **other,
+	     int num_other)
+{
+	int i;
+
+	if (one && !strcasecmp(one, val))
+		return true;
+	for (i = 0; i < num_other; i++)
+		if (!strcasecmp(other[i], val))
+			return true;
+	return false;
+}
+
+int expr__find_other(const char *expr, const char *one, const char ***other,
+		     int *num_other)
+{
+	int err, i = 0, j = 0;
+	struct parse_ctx ctx;
+
+	expr__ctx_init(&ctx);
+	err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER);
+	if (err)
+		return -1;
+
+	*other = malloc((ctx.num_ids + 1) * sizeof(char *));
+	if (!*other)
+		return -ENOMEM;
+
+	for (i = 0, j = 0; i < ctx.num_ids; i++) {
+		const char *str = ctx.ids[i].name;
+
+		if (already_seen(str, one, *other, j))
+			continue;
+
+		str = strdup(str);
+		if (!str)
+			goto out;
+		(*other)[j++] = str;
+	}
+	(*other)[j] = NULL;
+
+out:
+	if (i != ctx.num_ids) {
+		while (--j)
+			free((char *) (*other)[i]);
+		free(*other);
+		err = -1;
+	}
+
+	*num_other = j;
+	return err;
+}
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index 046160831f90..9377538f4097 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -2,7 +2,7 @@
 #ifndef PARSE_CTX_H
 #define PARSE_CTX_H 1
 
-#define EXPR_MAX_OTHER 15
+#define EXPR_MAX_OTHER 20
 #define MAX_PARSE_ID EXPR_MAX_OTHER
 
 struct parse_id {
@@ -17,10 +17,8 @@ struct parse_ctx {
 
 void expr__ctx_init(struct parse_ctx *ctx);
 void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
-#ifndef IN_EXPR_Y
-int expr__parse(double *final_val, struct parse_ctx *ctx, const char **pp);
-#endif
-int expr__find_other(const char *p, const char *one, const char ***other,
+int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
+int expr__find_other(const char *expr, const char *one, const char ***other,
 		int *num_other);
 
 #endif
diff --git a/tools/perf/util/expr.l b/tools/perf/util/expr.l
new file mode 100644
index 000000000000..eaad29243c23
--- /dev/null
+++ b/tools/perf/util/expr.l
@@ -0,0 +1,114 @@
+%option prefix="expr_"
+%option reentrant
+%option bison-bridge
+
+%{
+#include <linux/compiler.h>
+#include "expr.h"
+#include "expr-bison.h"
+
+char *expr_get_text(yyscan_t yyscanner);
+YYSTYPE *expr_get_lval(yyscan_t yyscanner);
+
+static int __value(YYSTYPE *yylval, char *str, int base, int token)
+{
+	u64 num;
+
+	errno = 0;
+	num = strtoull(str, NULL, base);
+	if (errno)
+		return EXPR_ERROR;
+
+	yylval->num = num;
+	return token;
+}
+
+static int value(yyscan_t scanner, int base)
+{
+	YYSTYPE *yylval = expr_get_lval(scanner);
+	char *text = expr_get_text(scanner);
+
+	return __value(yylval, text, base, NUMBER);
+}
+
+/*
+ * Allow @ instead of / to be able to specify pmu/event/ without
+ * conflicts with normal division.
+ */
+static char *normalize(char *str)
+{
+	char *ret = str;
+	char *dst = str;
+
+	while (*str) {
+		if (*str == '@')
+			*dst++ = '/';
+		else if (*str == '\\')
+			*dst++ = *++str;
+		else
+			*dst++ = *str;
+		str++;
+	}
+
+	*dst = 0x0;
+	return ret;
+}
+
+static int str(yyscan_t scanner, int token)
+{
+	YYSTYPE *yylval = expr_get_lval(scanner);
+	char *text = expr_get_text(scanner);
+
+	yylval->str = normalize(strdup(text));
+	if (!yylval->str)
+		return EXPR_ERROR;
+
+	yylval->str = normalize(yylval->str);
+	return token;
+}
+%}
+
+number		[0-9]+
+
+sch		[-,=]
+spec		\\{sch}
+sym		[0-9a-zA-Z_\.:@]+
+symbol		{spec}*{sym}*{spec}*{sym}*
+
+%%
+	{
+		int start_token;
+
+		start_token = expr_get_extra(yyscanner);
+
+		if (start_token) {
+			expr_set_extra(NULL, yyscanner);
+			return start_token;
+		}
+	}
+
+max		{ return MAX; }
+min		{ return MIN; }
+if		{ return IF; }
+else		{ return ELSE; }
+#smt_on		{ return SMT_ON; }
+{number}	{ return value(yyscanner, 10); }
+{symbol}	{ return str(yyscanner, ID); }
+"|"		{ return '|'; }
+"^"		{ return '^'; }
+"&"		{ return '&'; }
+"-"		{ return '-'; }
+"+"		{ return '+'; }
+"*"		{ return '*'; }
+"/"		{ return '/'; }
+"%"		{ return '%'; }
+"("		{ return '('; }
+")"		{ return ')'; }
+","		{ return ','; }
+.		{ }
+%%
+
+int expr_wrap(void *scanner __maybe_unused)
+{
+	return 1;
+}
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
index 7d226241f1d7..4720cbe79357 100644
--- a/tools/perf/util/expr.y
+++ b/tools/perf/util/expr.y
@@ -1,31 +1,32 @@
 /* Simple expression parser */
 %{
+#define YYDEBUG 1
+#include <stdio.h>
 #include "util.h"
 #include "util/debug.h"
 #include <stdlib.h> // strtod()
 #define IN_EXPR_Y 1
 #include "expr.h"
 #include "smt.h"
-#include <assert.h>
 #include <string.h>
 
-#define MAXIDLEN 256
 %}
 
 %define api.pure full
 
 %parse-param { double *final_val }
 %parse-param { struct parse_ctx *ctx }
-%parse-param { const char **pp }
-%lex-param { const char **pp }
+%parse-param {void *scanner}
+%lex-param {void* scanner}
 
 %union {
-	double num;
-	char id[MAXIDLEN+1];
+	double	 num;
+	char	*str;
 }
 
+%token EXPR_PARSE EXPR_OTHER EXPR_ERROR
 %token <num> NUMBER
-%token <id> ID
+%token <str> ID
 %token MIN MAX IF ELSE SMT_ON
 %left MIN MAX IF
 %left '|'
@@ -37,11 +38,9 @@
 %type <num> expr if_expr
 
 %{
-static int expr__lex(YYSTYPE *res, const char **pp);
-
-static void expr__error(double *final_val __maybe_unused,
+static void expr_error(double *final_val __maybe_unused,
 		       struct parse_ctx *ctx __maybe_unused,
-		       const char **pp __maybe_unused,
+		       void *scanner,
 		       const char *s)
 {
 	pr_debug("%s\n", s);
@@ -63,6 +62,27 @@ static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
 %}
 %%
 
+start:
+EXPR_PARSE all_expr
+|
+EXPR_OTHER all_other
+
+all_other: all_other other
+|
+
+other: ID
+{
+	if (ctx->num_ids + 1 >= EXPR_MAX_OTHER) {
+		pr_err("failed: way too many variables");
+		YYABORT;
+	}
+
+	ctx->ids[ctx->num_ids++].name = $1;
+}
+|
+MIN | MAX | IF | ELSE | SMT_ON | NUMBER | '|' | '^' | '&' | '-' | '+' | '*' | '/' | '%' | '(' | ')'
+
+
 all_expr: if_expr			{ *final_val = $1; }
 	;
 
@@ -93,146 +113,3 @@ expr:	  NUMBER
 	;
 
 %%
-
-static int expr__symbol(YYSTYPE *res, const char *p, const char **pp)
-{
-	char *dst = res->id;
-	const char *s = p;
-
-	if (*p == '#')
-		*dst++ = *p++;
-
-	while (isalnum(*p) || *p == '_' || *p == '.' || *p == ':' || *p == '@' || *p == '\\') {
-		if (p - s >= MAXIDLEN)
-			return -1;
-		/*
-		 * Allow @ instead of / to be able to specify pmu/event/ without
-		 * conflicts with normal division.
-		 */
-		if (*p == '@')
-			*dst++ = '/';
-		else if (*p == '\\')
-			*dst++ = *++p;
-		else
-			*dst++ = *p;
-		p++;
-	}
-	*dst = 0;
-	*pp = p;
-	dst = res->id;
-	switch (dst[0]) {
-	case 'm':
-		if (!strcmp(dst, "min"))
-			return MIN;
-		if (!strcmp(dst, "max"))
-			return MAX;
-		break;
-	case 'i':
-		if (!strcmp(dst, "if"))
-			return IF;
-		break;
-	case 'e':
-		if (!strcmp(dst, "else"))
-			return ELSE;
-		break;
-	case '#':
-		if (!strcasecmp(dst, "#smt_on"))
-			return SMT_ON;
-		break;
-	}
-	return ID;
-}
-
-static int expr__lex(YYSTYPE *res, const char **pp)
-{
-	int tok;
-	const char *s;
-	const char *p = *pp;
-
-	while (isspace(*p))
-		p++;
-	s = p;
-	switch (*p++) {
-	case '#':
-	case 'a' ... 'z':
-	case 'A' ... 'Z':
-		return expr__symbol(res, p - 1, pp);
-	case '0' ... '9': case '.':
-		res->num = strtod(s, (char **)&p);
-		tok = NUMBER;
-		break;
-	default:
-		tok = *s;
-		break;
-	}
-	*pp = p;
-	return tok;
-}
-
-/* Caller must make sure id is allocated */
-void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
-{
-	int idx;
-	assert(ctx->num_ids < MAX_PARSE_ID);
-	idx = ctx->num_ids++;
-	ctx->ids[idx].name = name;
-	ctx->ids[idx].val = val;
-}
-
-void expr__ctx_init(struct parse_ctx *ctx)
-{
-	ctx->num_ids = 0;
-}
-
-static bool already_seen(const char *val, const char *one, const char **other,
-			 int num_other)
-{
-	int i;
-
-	if (one && !strcasecmp(one, val))
-		return true;
-	for (i = 0; i < num_other; i++)
-		if (!strcasecmp(other[i], val))
-			return true;
-	return false;
-}
-
-int expr__find_other(const char *p, const char *one, const char ***other,
-		     int *num_otherp)
-{
-	const char *orig = p;
-	int err = -1;
-	int num_other;
-
-	*other = malloc((EXPR_MAX_OTHER + 1) * sizeof(char *));
-	if (!*other)
-		return -1;
-
-	num_other = 0;
-	for (;;) {
-		YYSTYPE val;
-		int tok = expr__lex(&val, &p);
-		if (tok == 0) {
-			err = 0;
-			break;
-		}
-		if (tok == ID && !already_seen(val.id, one, *other, num_other)) {
-			if (num_other >= EXPR_MAX_OTHER - 1) {
-				pr_debug("Too many extra events in %s\n", orig);
-				break;
-			}
-			(*other)[num_other] = strdup(val.id);
-			if (!(*other)[num_other])
-				return -1;
-			num_other++;
-		}
-	}
-	(*other)[num_other] = NULL;
-	*num_otherp = num_other;
-	if (err) {
-		*num_otherp = 0;
-		free(*other);
-		*other = NULL;
-	}
-	return err;
-}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4246e7447e54..acbd046bf95c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1590,6 +1590,40 @@ static void free_event_desc(struct evsel *events)
 	free(events);
 }
 
+static bool perf_attr_check(struct perf_event_attr *attr)
+{
+	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) {
+		pr_warning("Reserved bits are set unexpectedly. "
+			   "Please update perf tool.\n");
+		return false;
+	}
+
+	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) {
+		pr_warning("Unknown sample type (0x%llx) is detected. "
+			   "Please update perf tool.\n",
+			   attr->sample_type);
+		return false;
+	}
+
+	if (attr->read_format & ~(PERF_FORMAT_MAX-1)) {
+		pr_warning("Unknown read format (0x%llx) is detected. "
+			   "Please update perf tool.\n",
+			   attr->read_format);
+		return false;
+	}
+
+	if ((attr->sample_type & PERF_SAMPLE_BRANCH_STACK) &&
+	    (attr->branch_sample_type & ~(PERF_SAMPLE_BRANCH_MAX-1))) {
+		pr_warning("Unknown branch sample type (0x%llx) is detected. "
+			   "Please update perf tool.\n",
+			   attr->branch_sample_type);
+
+		return false;
+	}
+
+	return true;
+}
+
 static struct evsel *read_event_desc(struct feat_fd *ff)
 {
 	struct evsel *evsel, *events = NULL;
@@ -1634,6 +1668,9 @@ static struct evsel *read_event_desc(struct feat_fd *ff)
 
 		memcpy(&evsel->core.attr, buf, msz);
 
+		if (!perf_attr_check(&evsel->core.attr))
+			goto error;
+
 		if (do_read_u32(ff, &nr))
 			goto error;
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index ca5a8f4d007e..e74a5acf66d9 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -2584,9 +2584,10 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
 			  u64 *total_cycles)
 {
 	struct branch_info *bi;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 
 	/* If we have branch cycles always annotate them. */
-	if (bs && bs->nr && bs->entries[0].flags.cycles) {
+	if (bs && bs->nr && entries[0].flags.cycles) {
 		int i;
 
 		bi = sample__resolve_bstack(sample, al);
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 33cf8928cf05..23c8289c2472 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -1295,6 +1295,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	struct perf_sample sample = { .ip = 0, };
 	struct dummy_branch_stack {
 		u64			nr;
+		u64			hw_idx;
 		struct branch_entry	entries;
 	} dummy_bs;
 
@@ -1316,6 +1317,7 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
 	if (pt->synth_opts.last_branch && sort__mode == SORT_MODE__BRANCH) {
 		dummy_bs = (struct dummy_branch_stack){
 			.nr = 1,
+			.hw_idx = -1ULL,
 			.entries = {
 				.from = sample.ip,
 				.to = sample.addr,
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index b5af680fc667..dbdffb6673fe 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -265,6 +265,8 @@ static int detect_kbuild_dir(char **kbuild_dir)
 			return -ENOMEM;
 		return 0;
 	}
+	pr_debug("%s: Couldn't find \"%s\", missing kernel-devel package?.\n",
+		 __func__, autoconf_path);
 	free(autoconf_path);
 	return -ENOENT;
 }
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index fb5c2cd44d30..fd14f1489802 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2081,15 +2081,16 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
 {
 	unsigned int i;
 	const struct branch_stack *bs = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	struct branch_info *bi = calloc(bs->nr, sizeof(struct branch_info));
 
 	if (!bi)
 		return NULL;
 
 	for (i = 0; i < bs->nr; i++) {
-		ip__resolve_ams(al->thread, &bi[i].to, bs->entries[i].to);
-		ip__resolve_ams(al->thread, &bi[i].from, bs->entries[i].from);
-		bi[i].flags = bs->entries[i].flags;
+		ip__resolve_ams(al->thread, &bi[i].to, entries[i].to);
+		ip__resolve_ams(al->thread, &bi[i].from, entries[i].from);
+		bi[i].flags = entries[i].flags;
 	}
 	return bi;
 }
@@ -2185,6 +2186,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 	/* LBR only affects the user callchain */
 	if (i != chain_nr) {
 		struct branch_stack *lbr_stack = sample->branch_stack;
+		struct branch_entry *entries = perf_sample__branch_entries(sample);
 		int lbr_nr = lbr_stack->nr, j, k;
 		bool branch;
 		struct branch_flags *flags;
@@ -2210,31 +2212,29 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
 					ip = chain->ips[j];
 				else if (j > i + 1) {
 					k = j - i - 2;
-					ip = lbr_stack->entries[k].from;
+					ip = entries[k].from;
 					branch = true;
-					flags = &lbr_stack->entries[k].flags;
+					flags = &entries[k].flags;
 				} else {
-					ip = lbr_stack->entries[0].to;
+					ip = entries[0].to;
 					branch = true;
-					flags = &lbr_stack->entries[0].flags;
-					branch_from =
-						lbr_stack->entries[0].from;
+					flags = &entries[0].flags;
+					branch_from = entries[0].from;
 				}
 			} else {
 				if (j < lbr_nr) {
 					k = lbr_nr - j - 1;
-					ip = lbr_stack->entries[k].from;
+					ip = entries[k].from;
 					branch = true;
-					flags = &lbr_stack->entries[k].flags;
+					flags = &entries[k].flags;
 				}
 				else if (j > lbr_nr)
 					ip = chain->ips[i + 1 - (j - lbr_nr)];
 				else {
-					ip = lbr_stack->entries[0].to;
+					ip = entries[0].to;
 					branch = true;
-					flags = &lbr_stack->entries[0].flags;
-					branch_from =
-						lbr_stack->entries[0].from;
+					flags = &entries[0].flags;
+					branch_from = entries[0].from;
 				}
 			}
 
@@ -2281,6 +2281,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 					    int max_stack)
 {
 	struct branch_stack *branch = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	struct ip_callchain *chain = sample->callchain;
 	int chain_nr = 0;
 	u8 cpumode = PERF_RECORD_MISC_USER;
@@ -2328,7 +2329,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 
 		for (i = 0; i < nr; i++) {
 			if (callchain_param.order == ORDER_CALLEE) {
-				be[i] = branch->entries[i];
+				be[i] = entries[i];
 
 				if (chain == NULL)
 					continue;
@@ -2347,7 +2348,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 				    be[i].from >= chain->ips[first_call] - 8)
 					first_call++;
 			} else
-				be[i] = branch->entries[branch->nr - i - 1];
+				be[i] = entries[branch->nr - i - 1];
 		}
 
 		memset(iter, 0, sizeof(struct iterations) * nr);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index b342f744b1fc..53d96611e6a6 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -44,8 +44,8 @@ static inline int is_no_dso_memory(const char *filename)
 
 static inline int is_android_lib(const char *filename)
 {
-	return !strncmp(filename, "/data/app-lib", 13) ||
-	       !strncmp(filename, "/system/lib", 11);
+	return strstarts(filename, "/data/app-lib/") ||
+	       strstarts(filename, "/system/lib/");
 }
 
 static inline bool replace_android_lib(const char *filename, char *newfilename)
@@ -65,7 +65,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename)
 
 	app_abi_length = strlen(app_abi);
 
-	if (!strncmp(filename, "/data/app-lib", 13)) {
+	if (strstarts(filename, "/data/app-lib/")) {
 		char *apk_path;
 
 		if (!app_abi_length)
@@ -89,7 +89,7 @@ static inline bool replace_android_lib(const char *filename, char *newfilename)
 		return true;
 	}
 
-	if (!strncmp(filename, "/system/lib/", 12)) {
+	if (strstarts(filename, "/system/lib/")) {
 		char *ndk, *app;
 		const char *arch;
 		size_t ndk_length;
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 02aee946b6c1..c3a8c701609a 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -22,6 +22,8 @@
 #include <linux/string.h>
 #include <linux/zalloc.h>
 #include <subcmd/parse-options.h>
+#include <api/fs/fs.h>
+#include "util.h"
 
 struct metric_event *metricgroup__lookup(struct rblist *metric_events,
 					 struct evsel *evsel,
@@ -399,13 +401,85 @@ void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 	strlist__delete(metriclist);
 }
 
+static void metricgroup__add_metric_weak_group(struct strbuf *events,
+					       const char **ids,
+					       int idnum)
+{
+	bool no_group = false;
+	int i;
+
+	for (i = 0; i < idnum; i++) {
+		pr_debug("found event %s\n", ids[i]);
+		/*
+		 * Duration time maps to a software event and can make
+		 * groups not count. Always use it outside a
+		 * group.
+		 */
+		if (!strcmp(ids[i], "duration_time")) {
+			if (i > 0)
+				strbuf_addf(events, "}:W,");
+			strbuf_addf(events, "duration_time");
+			no_group = true;
+			continue;
+		}
+		strbuf_addf(events, "%s%s",
+			i == 0 || no_group ? "{" : ",",
+			ids[i]);
+		no_group = false;
+	}
+	if (!no_group)
+		strbuf_addf(events, "}:W");
+}
+
+static void metricgroup__add_metric_non_group(struct strbuf *events,
+					      const char **ids,
+					      int idnum)
+{
+	int i;
+
+	for (i = 0; i < idnum; i++)
+		strbuf_addf(events, ",%s", ids[i]);
+}
+
+static void metricgroup___watchdog_constraint_hint(const char *name, bool foot)
+{
+	static bool violate_nmi_constraint;
+
+	if (!foot) {
+		pr_warning("Splitting metric group %s into standalone metrics.\n", name);
+		violate_nmi_constraint = true;
+		return;
+	}
+
+	if (!violate_nmi_constraint)
+		return;
+
+	pr_warning("Try disabling the NMI watchdog to comply NO_NMI_WATCHDOG metric constraint:\n"
+		   "    echo 0 > /proc/sys/kernel/nmi_watchdog\n"
+		   "    perf stat ...\n"
+		   "    echo 1 > /proc/sys/kernel/nmi_watchdog\n");
+}
+
+static bool metricgroup__has_constraint(struct pmu_event *pe)
+{
+	if (!pe->metric_constraint)
+		return false;
+
+	if (!strcmp(pe->metric_constraint, "NO_NMI_WATCHDOG") &&
+	    sysctl__nmi_watchdog_enabled()) {
+		metricgroup___watchdog_constraint_hint(pe->metric_name, false);
+		return true;
+	}
+
+	return false;
+}
+
 static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 				   struct list_head *group_list)
 {
 	struct pmu_events_map *map = perf_pmu__find_map(NULL);
 	struct pmu_event *pe;
-	int ret = -EINVAL;
-	int i, j;
+	int i, ret = -EINVAL;
 
 	if (!map)
 		return 0;
@@ -422,7 +496,6 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 			const char **ids;
 			int idnum;
 			struct egroup *eg;
-			bool no_group = false;
 
 			pr_debug("metric expr %s for %s\n", pe->metric_expr, pe->metric_name);
 
@@ -431,27 +504,11 @@ static int metricgroup__add_metric(const char *metric, struct strbuf *events,
 				continue;
 			if (events->len > 0)
 				strbuf_addf(events, ",");
-			for (j = 0; j < idnum; j++) {
-				pr_debug("found event %s\n", ids[j]);
-				/*
-				 * Duration time maps to a software event and can make
-				 * groups not count. Always use it outside a
-				 * group.
-				 */
-				if (!strcmp(ids[j], "duration_time")) {
-					if (j > 0)
-						strbuf_addf(events, "}:W,");
-					strbuf_addf(events, "duration_time");
-					no_group = true;
-					continue;
-				}
-				strbuf_addf(events, "%s%s",
-					j == 0 || no_group ? "{" : ",",
-					ids[j]);
-				no_group = false;
-			}
-			if (!no_group)
-				strbuf_addf(events, "}:W");
+
+			if (metricgroup__has_constraint(pe))
+				metricgroup__add_metric_non_group(events, ids, idnum);
+			else
+				metricgroup__add_metric_weak_group(events, ids, idnum);
 
 			eg = malloc(sizeof(struct egroup));
 			if (!eg) {
@@ -493,6 +550,10 @@ static int metricgroup__add_metric_list(const char *list, struct strbuf *events,
 		}
 	}
 	free(nlist);
+
+	if (!ret)
+		metricgroup___watchdog_constraint_hint(NULL, true);
+
 	return ret;
 }
 
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 3b664fa673a6..ab7108d22428 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -98,20 +98,29 @@ static int perf_mmap__aio_bind(struct mmap *map, int idx, int cpu, int affinity)
 {
 	void *data;
 	size_t mmap_len;
-	unsigned long node_mask;
+	unsigned long *node_mask;
+	unsigned long node_index;
+	int err = 0;
 
 	if (affinity != PERF_AFFINITY_SYS && cpu__max_node() > 1) {
 		data = map->aio.data[idx];
 		mmap_len = mmap__mmap_len(map);
-		node_mask = 1UL << cpu__get_node(cpu);
-		if (mbind(data, mmap_len, MPOL_BIND, &node_mask, 1, 0)) {
-			pr_err("Failed to bind [%p-%p] AIO buffer to node %d: error %m\n",
-				data, data + mmap_len, cpu__get_node(cpu));
+		node_index = cpu__get_node(cpu);
+		node_mask = bitmap_alloc(node_index + 1);
+		if (!node_mask) {
+			pr_err("Failed to allocate node mask for mbind: error %m\n");
 			return -1;
 		}
+		set_bit(node_index, node_mask);
+		if (mbind(data, mmap_len, MPOL_BIND, node_mask, node_index + 1 + 1, 0)) {
+			pr_err("Failed to bind [%p-%p] AIO buffer to node %lu: error %m\n",
+				data, data + mmap_len, node_index);
+			err = -1;
+		}
+		bitmap_free(node_mask);
 	}
 
-	return 0;
+	return err;
 }
 #else /* !HAVE_LIBNUMA_SUPPORT */
 static int perf_mmap__aio_alloc(struct mmap *map, int idx)
diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c
index 651203126c71..355d3458d4e6 100644
--- a/tools/perf/util/perf_event_attr_fprintf.c
+++ b/tools/perf/util/perf_event_attr_fprintf.c
@@ -50,6 +50,7 @@ static void __p_branch_sample_type(char *buf, size_t size, u64 value)
 		bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX),
 		bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP),
 		bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES),
+		bit_name(HW_INDEX),
 		{ .name = NULL, }
 	};
 #undef bit_name
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 80ca5d0ab7fe..8c1b27cd8b99 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -464,6 +464,7 @@ static PyObject *python_process_brstack(struct perf_sample *sample,
 					struct thread *thread)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	PyObject *pylist;
 	u64 i;
 
@@ -484,28 +485,28 @@ static PyObject *python_process_brstack(struct perf_sample *sample,
 			Py_FatalError("couldn't create Python dictionary");
 
 		pydict_set_item_string_decref(pyelem, "from",
-		    PyLong_FromUnsignedLongLong(br->entries[i].from));
+		    PyLong_FromUnsignedLongLong(entries[i].from));
 		pydict_set_item_string_decref(pyelem, "to",
-		    PyLong_FromUnsignedLongLong(br->entries[i].to));
+		    PyLong_FromUnsignedLongLong(entries[i].to));
 		pydict_set_item_string_decref(pyelem, "mispred",
-		    PyBool_FromLong(br->entries[i].flags.mispred));
+		    PyBool_FromLong(entries[i].flags.mispred));
 		pydict_set_item_string_decref(pyelem, "predicted",
-		    PyBool_FromLong(br->entries[i].flags.predicted));
+		    PyBool_FromLong(entries[i].flags.predicted));
 		pydict_set_item_string_decref(pyelem, "in_tx",
-		    PyBool_FromLong(br->entries[i].flags.in_tx));
+		    PyBool_FromLong(entries[i].flags.in_tx));
 		pydict_set_item_string_decref(pyelem, "abort",
-		    PyBool_FromLong(br->entries[i].flags.abort));
+		    PyBool_FromLong(entries[i].flags.abort));
 		pydict_set_item_string_decref(pyelem, "cycles",
-		    PyLong_FromUnsignedLongLong(br->entries[i].flags.cycles));
+		    PyLong_FromUnsignedLongLong(entries[i].flags.cycles));
 
 		thread__find_map_fb(thread, sample->cpumode,
-				    br->entries[i].from, &al);
+				    entries[i].from, &al);
 		dsoname = get_dsoname(al.map);
 		pydict_set_item_string_decref(pyelem, "from_dsoname",
 					      _PyUnicode_FromString(dsoname));
 
 		thread__find_map_fb(thread, sample->cpumode,
-				    br->entries[i].to, &al);
+				    entries[i].to, &al);
 		dsoname = get_dsoname(al.map);
 		pydict_set_item_string_decref(pyelem, "to_dsoname",
 					      _PyUnicode_FromString(dsoname));
@@ -561,6 +562,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample,
 					   struct thread *thread)
 {
 	struct branch_stack *br = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	PyObject *pylist;
 	u64 i;
 	char bf[512];
@@ -581,22 +583,22 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample,
 			Py_FatalError("couldn't create Python dictionary");
 
 		thread__find_symbol_fb(thread, sample->cpumode,
-				       br->entries[i].from, &al);
+				       entries[i].from, &al);
 		get_symoff(al.sym, &al, true, bf, sizeof(bf));
 		pydict_set_item_string_decref(pyelem, "from",
 					      _PyUnicode_FromString(bf));
 
 		thread__find_symbol_fb(thread, sample->cpumode,
-				       br->entries[i].to, &al);
+				       entries[i].to, &al);
 		get_symoff(al.sym, &al, true, bf, sizeof(bf));
 		pydict_set_item_string_decref(pyelem, "to",
 					      _PyUnicode_FromString(bf));
 
-		get_br_mspred(&br->entries[i].flags, bf, sizeof(bf));
+		get_br_mspred(&entries[i].flags, bf, sizeof(bf));
 		pydict_set_item_string_decref(pyelem, "pred",
 					      _PyUnicode_FromString(bf));
 
-		if (br->entries[i].flags.in_tx) {
+		if (entries[i].flags.in_tx) {
 			pydict_set_item_string_decref(pyelem, "in_tx",
 					      _PyUnicode_FromString("X"));
 		} else {
@@ -604,7 +606,7 @@ static PyObject *python_process_brstacksym(struct perf_sample *sample,
 					      _PyUnicode_FromString("-"));
 		}
 
-		if (br->entries[i].flags.abort) {
+		if (entries[i].flags.abort) {
 			pydict_set_item_string_decref(pyelem, "abort",
 					      _PyUnicode_FromString("A"));
 		} else {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index d0d7d25b23e3..055b00abd56d 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1007,6 +1007,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 {
 	struct ip_callchain *callchain = sample->callchain;
 	struct branch_stack *lbr_stack = sample->branch_stack;
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	u64 kernel_callchain_nr = callchain->nr;
 	unsigned int i;
 
@@ -1043,10 +1044,10 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 			       i, callchain->ips[i]);
 
 		printf("..... %2d: %016" PRIx64 "\n",
-		       (int)(kernel_callchain_nr), lbr_stack->entries[0].to);
+		       (int)(kernel_callchain_nr), entries[0].to);
 		for (i = 0; i < lbr_stack->nr; i++)
 			printf("..... %2d: %016" PRIx64 "\n",
-			       (int)(i + kernel_callchain_nr + 1), lbr_stack->entries[i].from);
+			       (int)(i + kernel_callchain_nr + 1), entries[i].from);
 	}
 }
 
@@ -1068,6 +1069,7 @@ static void callchain__printf(struct evsel *evsel,
 
 static void branch_stack__printf(struct perf_sample *sample, bool callstack)
 {
+	struct branch_entry *entries = perf_sample__branch_entries(sample);
 	uint64_t i;
 
 	printf("%s: nr:%" PRIu64 "\n",
@@ -1075,7 +1077,7 @@ static void branch_stack__printf(struct perf_sample *sample, bool callstack)
 		sample->branch_stack->nr);
 
 	for (i = 0; i < sample->branch_stack->nr; i++) {
-		struct branch_entry *e = &sample->branch_stack->entries[i];
+		struct branch_entry *e = &entries[i];
 
 		if (!callstack) {
 			printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 " %hu cycles %s%s%s%s %x\n",
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index bc31fccc0057..76c6052b12e2 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -16,6 +16,7 @@
 #include <linux/ctype.h>
 #include "cgroup.h"
 #include <api/fs/fs.h>
+#include "util.h"
 
 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
@@ -110,7 +111,7 @@ static void aggr_printout(struct perf_stat_config *config,
 			config->csv_sep);
 			break;
 	case AGGR_NONE:
-		if (evsel->percore) {
+		if (evsel->percore && !config->percore_show_thread) {
 			fprintf(config->output, "S%d-D%d-C%*d%s",
 				cpu_map__id_to_socket(id),
 				cpu_map__id_to_die(id),
@@ -628,7 +629,7 @@ static void aggr_cb(struct perf_stat_config *config,
 static void print_counter_aggrdata(struct perf_stat_config *config,
 				   struct evsel *counter, int s,
 				   char *prefix, bool metric_only,
-				   bool *first)
+				   bool *first, int cpu)
 {
 	struct aggr_data ad;
 	FILE *output = config->output;
@@ -654,7 +655,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 		fprintf(output, "%s", prefix);
 
 	uval = val * counter->scale;
-	printout(config, id, nr, counter, uval, prefix,
+	printout(config, cpu != -1 ? cpu : id, nr, counter, uval, prefix,
 		 run, ena, 1.0, &rt_stat);
 	if (!metric_only)
 		fputc('\n', output);
@@ -687,7 +688,7 @@ static void print_aggr(struct perf_stat_config *config,
 		evlist__for_each_entry(evlist, counter) {
 			print_counter_aggrdata(config, counter, s,
 					       prefix, metric_only,
-					       &first);
+					       &first, -1);
 		}
 		if (metric_only)
 			fputc('\n', output);
@@ -1097,7 +1098,6 @@ static void print_footer(struct perf_stat_config *config)
 {
 	double avg = avg_stats(config->walltime_nsecs_stats) / NSEC_PER_SEC;
 	FILE *output = config->output;
-	int n;
 
 	if (!config->null_run)
 		fprintf(output, "\n");
@@ -1131,9 +1131,7 @@ static void print_footer(struct perf_stat_config *config)
 	}
 	fprintf(output, "\n\n");
 
-	if (config->print_free_counters_hint &&
-	    sysctl__read_int("kernel/nmi_watchdog", &n) >= 0 &&
-	    n > 0)
+	if (config->print_free_counters_hint && sysctl__nmi_watchdog_enabled())
 		fprintf(output,
 "Some events weren't counted. Try disabling the NMI watchdog:\n"
 "	echo 0 > /proc/sys/kernel/nmi_watchdog\n"
@@ -1146,6 +1144,26 @@ static void print_footer(struct perf_stat_config *config)
 			"the same PMU. Try reorganizing the group.\n");
 }
 
+static void print_percore_thread(struct perf_stat_config *config,
+				 struct evsel *counter, char *prefix)
+{
+	int s, s2, id;
+	bool first = true;
+
+	for (int i = 0; i < perf_evsel__nr_cpus(counter); i++) {
+		s2 = config->aggr_get_id(config, evsel__cpus(counter), i);
+		for (s = 0; s < config->aggr_map->nr; s++) {
+			id = config->aggr_map->map[s];
+			if (s2 == id)
+				break;
+		}
+
+		print_counter_aggrdata(config, counter, s,
+				       prefix, false,
+				       &first, i);
+	}
+}
+
 static void print_percore(struct perf_stat_config *config,
 			  struct evsel *counter, char *prefix)
 {
@@ -1157,13 +1175,16 @@ static void print_percore(struct perf_stat_config *config,
 	if (!(config->aggr_map || config->aggr_get_id))
 		return;
 
+	if (config->percore_show_thread)
+		return print_percore_thread(config, counter, prefix);
+
 	for (s = 0; s < config->aggr_map->nr; s++) {
 		if (prefix && metric_only)
 			fprintf(output, "%s", prefix);
 
 		print_counter_aggrdata(config, counter, s,
 				       prefix, metric_only,
-				       &first);
+				       &first, -1);
 	}
 
 	if (metric_only)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 90d23cc3c8d4..0fd713d3674f 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -777,9 +777,7 @@ static void generic_metric(struct perf_stat_config *config,
 	}
 
 	if (!metric_events[i]) {
-		const char *p = metric_expr;
-
-		if (expr__parse(&ratio, &pctx, &p) == 0) {
+		if (expr__parse(&ratio, &pctx, metric_expr) == 0) {
 			char *unit;
 			char metric_bf[64];
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fb990efa54a8..b4fdfaa7f2c0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -109,6 +109,7 @@ struct perf_stat_config {
 	bool			 walltime_run_table;
 	bool			 all_kernel;
 	bool			 all_user;
+	bool			 percore_show_thread;
 	FILE			*output;
 	unsigned int		 interval;
 	unsigned int		 timeout;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index c423298fe62d..3f28af39f9c6 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -345,6 +345,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
 			continue;
 
 		event->mmap2.ino = (u64)ino;
+		event->mmap2.ino_generation = 0;
 
 		/*
 		 * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
@@ -1183,7 +1184,8 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
 
 	if (type & PERF_SAMPLE_BRANCH_STACK) {
 		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
+		/* nr, hw_idx */
+		sz += 2 * sizeof(u64);
 		result += sz;
 	}
 
@@ -1344,7 +1346,8 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo
 
 	if (type & PERF_SAMPLE_BRANCH_STACK) {
 		sz = sample->branch_stack->nr * sizeof(struct branch_entry);
-		sz += sizeof(u64);
+		/* nr, hw_idx */
+		sz += 2 * sizeof(u64);
 		memcpy(array, sample->branch_stack, sz);
 		array = (void *)array + sz;
 	}
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 969ae560dad9..d707c9624dd9 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -55,6 +55,24 @@ int sysctl__max_stack(void)
 	return sysctl_perf_event_max_stack;
 }
 
+bool sysctl__nmi_watchdog_enabled(void)
+{
+	static bool cached;
+	static bool nmi_watchdog;
+	int value;
+
+	if (cached)
+		return nmi_watchdog;
+
+	if (sysctl__read_int("kernel/nmi_watchdog", &value) < 0)
+		return false;
+
+	nmi_watchdog = (value > 0) ? true : false;
+	cached = true;
+
+	return nmi_watchdog;
+}
+
 bool test_attr__enabled;
 
 bool perf_host  = true;
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 9969b8b46f7c..f486fdd3a538 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -29,6 +29,8 @@ size_t hex_width(u64 v);
 
 int sysctl__max_stack(void);
 
+bool sysctl__nmi_watchdog_enabled(void);
+
 int fetch_kernel_version(unsigned int *puint,
 			 char *str, size_t str_sz);
 #define KVER_VERSION(x)		(((x) >> 16) & 0xff)