summary refs log tree commit diff
path: root/tools
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2012-03-12 20:46:35 +0100
committerIngo Molnar <mingo@elte.hu>2012-03-12 20:47:05 +0100
commitbea95c152dee1791dd02cbc708afbb115bb00f9a (patch)
treeaf9994c42c5fdd81ba3dadd7b812e2fa85273353 /tools
parentf9b4eeb809c6d031cc9561cc34dd691701cb2c2a (diff)
parent24bff2dc0f77b1f186b7bdf30060caf3df191a68 (diff)
downloadlinux-bea95c152dee1791dd02cbc708afbb115bb00f9a.tar.gz
Merge branch 'perf/hw-branch-sampling' into perf/core
Merge reason: The 'perf record -b' hardware branch sampling feature is ready for upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'tools')
-rw-r--r--tools/perf/Documentation/perf-record.txt30
-rw-r--r--tools/perf/Documentation/perf-report.txt10
-rw-r--r--tools/perf/builtin-record.c95
-rw-r--r--tools/perf/builtin-report.c178
-rw-r--r--tools/perf/perf.h18
-rw-r--r--tools/perf/util/event.h1
-rw-r--r--tools/perf/util/evsel.c14
-rw-r--r--tools/perf/util/header.c207
-rw-r--r--tools/perf/util/header.h2
-rw-r--r--tools/perf/util/hist.c122
-rw-r--r--tools/perf/util/hist.h11
-rw-r--r--tools/perf/util/session.c77
-rw-r--r--tools/perf/util/session.h4
-rw-r--r--tools/perf/util/sort.c287
-rw-r--r--tools/perf/util/sort.h11
-rw-r--r--tools/perf/util/symbol.h20
-rw-r--r--tools/perf/util/ui/browsers/hists.c102
17 files changed, 1023 insertions, 166 deletions
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index a5766b4b0125..a1386b2fff00 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -152,6 +152,36 @@ an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must ha
 corresponding events, i.e., they always refer to events defined earlier on the command
 line.
 
+-b::
+--branch-any::
+Enable taken branch stack sampling. Any type of taken branch may be sampled.
+This is a shortcut for --branch-filter any. See --branch-filter for more infos.
+
+-j::
+--branch-filter::
+Enable taken branch stack sampling. Each sample captures a series of consecutive
+taken branches. The number of branches captured with each sample depends on the
+underlying hardware, the type of branches of interest, and the executed code.
+It is possible to select the types of branches captured by enabling filters. The
+following filters are defined:
+
+        - any:  any type of branches
+        - any_call: any function call or system call
+        - any_ret: any function return or system call return
+        - any_ind: any indirect branch
+        - u:  only when the branch target is at the user level
+        - k: only when the branch target is in the kernel
+        - hv: only when the target is at the hypervisor level
+
++
+The option requires at least one branch type among any, any_call, any_ret, ind_call.
+The privilege levels may be ommitted, in which case, the privilege levels of the associated
+event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
+levels are subject to permissions.  When sampling on multiple events, branch stack sampling
+is enabled for all the sampling events. The sampled branch type is the same for all events.
+The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
+Note that this feature may not be available on all processors.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 9b430e98712e..87feeee8b90c 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -153,6 +153,16 @@ OPTIONS
 	information which may be very large and thus may clutter the display.
 	It currently includes: cpu and numa topology of the host system.
 
+-b::
+--branch-stack::
+	Use the addresses of sampled taken branches instead of the instruction
+	address to build the histograms. To generate meaningful output, the
+	perf.data file must have been obtained using perf record -b or
+	perf record --branch-filter xxx where xxx is a branch filter option.
+	perf report is able to auto-detect whether a perf.data file contains
+	branch stacks and it will automatically switch to the branch view mode,
+	unless --no-branch-stack is used.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 75d230fef202..be4e1eee782e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -473,6 +473,9 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 	if (!have_tracepoints(&evsel_list->entries))
 		perf_header__clear_feat(&session->header, HEADER_TRACE_INFO);
 
+	if (!rec->opts.branch_stack)
+		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
+
 	if (!rec->file_new) {
 		err = perf_session__read_header(session, output);
 		if (err < 0)
@@ -638,6 +641,90 @@ out_delete_session:
 	return err;
 }
 
+#define BRANCH_OPT(n, m) \
+	{ .name = n, .mode = (m) }
+
+#define BRANCH_END { .name = NULL }
+
+struct branch_mode {
+	const char *name;
+	int mode;
+};
+
+static const struct branch_mode branch_modes[] = {
+	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
+	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
+	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
+	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
+	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
+	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
+	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
+	BRANCH_END
+};
+
+static int
+parse_branch_stack(const struct option *opt, const char *str, int unset)
+{
+#define ONLY_PLM \
+	(PERF_SAMPLE_BRANCH_USER	|\
+	 PERF_SAMPLE_BRANCH_KERNEL	|\
+	 PERF_SAMPLE_BRANCH_HV)
+
+	uint64_t *mode = (uint64_t *)opt->value;
+	const struct branch_mode *br;
+	char *s, *os = NULL, *p;
+	int ret = -1;
+
+	if (unset)
+		return 0;
+
+	/*
+	 * cannot set it twice, -b + --branch-filter for instance
+	 */
+	if (*mode)
+		return -1;
+
+	/* str may be NULL in case no arg is passed to -b */
+	if (str) {
+		/* because str is read-only */
+		s = os = strdup(str);
+		if (!s)
+			return -1;
+
+		for (;;) {
+			p = strchr(s, ',');
+			if (p)
+				*p = '\0';
+
+			for (br = branch_modes; br->name; br++) {
+				if (!strcasecmp(s, br->name))
+					break;
+			}
+			if (!br->name) {
+				ui__warning("unknown branch filter %s,"
+					    " check man page\n", s);
+				goto error;
+			}
+
+			*mode |= br->mode;
+
+			if (!p)
+				break;
+
+			s = p + 1;
+		}
+	}
+	ret = 0;
+
+	/* default to any branch */
+	if ((*mode & ~ONLY_PLM) == 0) {
+		*mode = PERF_SAMPLE_BRANCH_ANY;
+	}
+error:
+	free(os);
+	return ret;
+}
+
 static const char * const record_usage[] = {
 	"perf record [<options>] [<command>]",
 	"perf record [<options>] -- <command> [<options>]",
@@ -727,6 +814,14 @@ const struct option record_options[] = {
 		     "monitor event in cgroup name only",
 		     parse_cgroups),
 	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
+
+	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
+		     "branch any", "sample any taken branches",
+		     parse_branch_stack),
+
+	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
+		     "branch filter mask", "branch stack filter modes",
+		     parse_branch_stack),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 25d34d483e49..8e91c6eba18a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -53,6 +53,82 @@ struct perf_report {
 	DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 };
 
+static int perf_report__add_branch_hist_entry(struct perf_tool *tool,
+					struct addr_location *al,
+					struct perf_sample *sample,
+					struct perf_evsel *evsel,
+				      struct machine *machine)
+{
+	struct perf_report *rep = container_of(tool, struct perf_report, tool);
+	struct symbol *parent = NULL;
+	int err = 0;
+	unsigned i;
+	struct hist_entry *he;
+	struct branch_info *bi, *bx;
+
+	if ((sort__has_parent || symbol_conf.use_callchain)
+	    && sample->callchain) {
+		err = machine__resolve_callchain(machine, evsel, al->thread,
+						 sample->callchain, &parent);
+		if (err)
+			return err;
+	}
+
+	bi = machine__resolve_bstack(machine, al->thread,
+				     sample->branch_stack);
+	if (!bi)
+		return -ENOMEM;
+
+	for (i = 0; i < sample->branch_stack->nr; i++) {
+		if (rep->hide_unresolved && !(bi[i].from.sym && bi[i].to.sym))
+			continue;
+		/*
+		 * The report shows the percentage of total branches captured
+		 * and not events sampled. Thus we use a pseudo period of 1.
+		 */
+		he = __hists__add_branch_entry(&evsel->hists, al, parent,
+				&bi[i], 1);
+		if (he) {
+			struct annotation *notes;
+			err = -ENOMEM;
+			bx = he->branch_info;
+			if (bx->from.sym && use_browser > 0) {
+				notes = symbol__annotation(bx->from.sym);
+				if (!notes->src
+				    && symbol__alloc_hist(bx->from.sym) < 0)
+					goto out;
+
+				err = symbol__inc_addr_samples(bx->from.sym,
+							       bx->from.map,
+							       evsel->idx,
+							       bx->from.al_addr);
+				if (err)
+					goto out;
+			}
+
+			if (bx->to.sym && use_browser > 0) {
+				notes = symbol__annotation(bx->to.sym);
+				if (!notes->src
+				    && symbol__alloc_hist(bx->to.sym) < 0)
+					goto out;
+
+				err = symbol__inc_addr_samples(bx->to.sym,
+							       bx->to.map,
+							       evsel->idx,
+							       bx->to.al_addr);
+				if (err)
+					goto out;
+			}
+			evsel->hists.stats.total_period += 1;
+			hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
+			err = 0;
+		} else
+			return -ENOMEM;
+	}
+out:
+	return err;
+}
+
 static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
 				      struct addr_location *al,
 				      struct perf_sample *sample,
@@ -126,14 +202,21 @@ static int process_sample_event(struct perf_tool *tool,
 	if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
 		return 0;
 
-	if (al.map != NULL)
-		al.map->dso->hit = 1;
+	if (sort__branch_mode == 1) {
+		if (perf_report__add_branch_hist_entry(tool, &al, sample,
+						       evsel, machine)) {
+			pr_debug("problem adding lbr entry, skipping event\n");
+			return -1;
+		}
+	} else {
+		if (al.map != NULL)
+			al.map->dso->hit = 1;
 
-	if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
-		pr_debug("problem incrementing symbol period, skipping event\n");
-		return -1;
+		if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
+			pr_debug("problem incrementing symbol period, skipping event\n");
+			return -1;
+		}
 	}
-
 	return 0;
 }
 
@@ -188,6 +271,15 @@ static int perf_report__setup_sample_type(struct perf_report *rep)
 			}
 	}
 
+	if (sort__branch_mode == 1) {
+		if (!(self->sample_type & PERF_SAMPLE_BRANCH_STACK)) {
+			fprintf(stderr, "selected -b but no branch data."
+					" Did you call perf record without"
+					" -b?\n");
+			return -1;
+		}
+	}
+
 	return 0;
 }
 
@@ -246,7 +338,7 @@ static int __cmd_report(struct perf_report *rep)
 {
 	int ret = -EINVAL;
 	u64 nr_samples;
-	struct perf_session *session;
+	struct perf_session *session = rep->session;
 	struct perf_evsel *pos;
 	struct map *kernel_map;
 	struct kmap *kernel_kmap;
@@ -254,13 +346,6 @@ static int __cmd_report(struct perf_report *rep)
 
 	signal(SIGINT, sig_handler);
 
-	session = perf_session__new(rep->input_name, O_RDONLY,
-				    rep->force, false, &rep->tool);
-	if (session == NULL)
-		return -ENOMEM;
-
-	rep->session = session;
-
 	if (rep->cpu_list) {
 		ret = perf_session__cpu_bitmap(session, rep->cpu_list,
 					       rep->cpu_bitmap);
@@ -427,9 +512,19 @@ setup:
 	return 0;
 }
 
+static int
+parse_branch_mode(const struct option *opt __used, const char *str __used, int unset)
+{
+	sort__branch_mode = !unset;
+	return 0;
+}
+
 int cmd_report(int argc, const char **argv, const char *prefix __used)
 {
+	struct perf_session *session;
 	struct stat st;
+	bool has_br_stack = false;
+	int ret = -1;
 	char callchain_default_opt[] = "fractal,0.5,callee";
 	const char * const report_usage[] = {
 		"perf report [<options>]",
@@ -477,7 +572,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	OPT_BOOLEAN(0, "stdio", &report.use_stdio,
 		    "Use the stdio interface"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-		   "sort by key(s): pid, comm, dso, symbol, parent"),
+		   "sort by key(s): pid, comm, dso, symbol, parent, dso_to,"
+		   " dso_from, symbol_to, symbol_from, mispredict"),
 	OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
 		    "Show sample percentage for different cpu modes"),
 	OPT_STRING('p', "parent", &parent_pattern, "regex",
@@ -517,6 +613,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		   "Specify disassembler style (e.g. -M intel for intel syntax)"),
 	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
 		    "Show a column with the sum of periods"),
+	OPT_CALLBACK_NOOPT('b', "branch-stack", &sort__branch_mode, "",
+		    "use branch records for histogram filling", parse_branch_mode),
 	OPT_END()
 	};
 
@@ -536,11 +634,36 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		else
 			report.input_name = "perf.data";
 	}
+	session = perf_session__new(report.input_name, O_RDONLY,
+				    report.force, false, &report.tool);
+	if (session == NULL)
+		return -ENOMEM;
 
-	if (strcmp(report.input_name, "-") != 0)
+	report.session = session;
+
+	has_br_stack = perf_header__has_feat(&session->header,
+					     HEADER_BRANCH_STACK);
+
+	if (sort__branch_mode == -1 && has_br_stack)
+		sort__branch_mode = 1;
+
+	/* sort__branch_mode could be 0 if --no-branch-stack */
+	if (sort__branch_mode == 1) {
+		/*
+		 * if no sort_order is provided, then specify
+		 * branch-mode specific order
+		 */
+		if (sort_order == default_sort_order)
+			sort_order = "comm,dso_from,symbol_from,"
+				     "dso_to,symbol_to";
+
+	}
+
+	if (strcmp(report.input_name, "-") != 0) {
 		setup_browser(true);
-	else
+	} else {
 		use_browser = 0;
+	}
 
 	/*
 	 * Only in the newt browser we are doing integrated annotation,
@@ -568,13 +691,13 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	}
 
 	if (symbol__init() < 0)
-		return -1;
+		goto error;
 
 	setup_sorting(report_usage, options);
 
 	if (parent_pattern != default_parent_pattern) {
 		if (sort_dimension__add("parent") < 0)
-			return -1;
+			goto error;
 
 		/*
 		 * Only show the parent fields if we explicitly
@@ -592,9 +715,20 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	if (argc)
 		usage_with_options(report_usage, options);
 
-	sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", stdout);
 	sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
-	sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
 
-	return __cmd_report(&report);
+	if (sort__branch_mode == 1) {
+		sort_entry__setup_elide(&sort_dso_from, symbol_conf.dso_from_list, "dso_from", stdout);
+		sort_entry__setup_elide(&sort_dso_to, symbol_conf.dso_to_list, "dso_to", stdout);
+		sort_entry__setup_elide(&sort_sym_from, symbol_conf.sym_from_list, "sym_from", stdout);
+		sort_entry__setup_elide(&sort_sym_to, symbol_conf.sym_to_list, "sym_to", stdout);
+	} else {
+		sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", stdout);
+		sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
+	}
+
+	ret = __cmd_report(&report);
+error:
+	perf_session__delete(session);
+	return ret;
 }
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index f0227e93665d..eec392e48067 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -179,6 +179,23 @@ struct ip_callchain {
 	u64 ips[0];
 };
 
+struct branch_flags {
+	u64 mispred:1;
+	u64 predicted:1;
+	u64 reserved:62;
+};
+
+struct branch_entry {
+	u64				from;
+	u64				to;
+	struct branch_flags flags;
+};
+
+struct branch_stack {
+	u64				nr;
+	struct branch_entry	entries[0];
+};
+
 extern bool perf_host, perf_guest;
 extern const char perf_version_string[];
 
@@ -205,6 +222,7 @@ struct perf_record_opts {
 	unsigned int freq;
 	unsigned int mmap_pages;
 	unsigned int user_freq;
+	int	     branch_stack;
 	u64	     default_interval;
 	u64	     user_interval;
 	const char   *cpu_list;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index cbdeaad9c5e5..1b197280c621 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -81,6 +81,7 @@ struct perf_sample {
 	u32 raw_size;
 	void *raw_data;
 	struct ip_callchain *callchain;
+	struct branch_stack *branch_stack;
 };
 
 #define BUILD_ID_SIZE 20
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 302d49a9f985..f421f7cbc0d3 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -126,6 +126,10 @@ void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts)
 		attr->watermark = 0;
 		attr->wakeup_events = 1;
 	}
+	if (opts->branch_stack) {
+		attr->sample_type	|= PERF_SAMPLE_BRANCH_STACK;
+		attr->branch_sample_type = opts->branch_stack;
+	}
 
 	attr->mmap = track;
 	attr->comm = track;
@@ -576,6 +580,16 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
 		data->raw_data = (void *) pdata;
 	}
 
+	if (type & PERF_SAMPLE_BRANCH_STACK) {
+		u64 sz;
+
+		data->branch_stack = (struct branch_stack *)array;
+		array++; /* nr */
+
+		sz = data->branch_stack->nr * sizeof(struct branch_entry);
+		sz /= sizeof(u64);
+		array += sz;
+	}
 	return 0;
 }
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 9f867d96c6a5..0d9b6da86a39 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1023,6 +1023,12 @@ write_it:
 	return do_write_string(fd, buffer);
 }
 
+static int write_branch_stack(int fd __used, struct perf_header *h __used,
+		       struct perf_evlist *evlist __used)
+{
+	return 0;
+}
+
 static void print_hostname(struct perf_header *ph, int fd, FILE *fp)
 {
 	char *str = do_read_string(fd, ph);
@@ -1144,8 +1150,9 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 	uint64_t id;
 	void *buf = NULL;
 	char *str;
-	u32 nre, sz, nr, i, j, msz;
-	int ret;
+	u32 nre, sz, nr, i, j;
+	ssize_t ret;
+	size_t msz;
 
 	/* number of events */
 	ret = read(fd, &nre, sizeof(nre));
@@ -1162,25 +1169,23 @@ static void print_event_desc(struct perf_header *ph, int fd, FILE *fp)
 	if (ph->needs_swap)
 		sz = bswap_32(sz);
 
-	/*
-	 * ensure it is at least to our ABI rev
-	 */
-	if (sz < (u32)sizeof(attr))
-		goto error;
-
 	memset(&attr, 0, sizeof(attr));
 
-	/* read entire region to sync up to next field */
+	/* buffer to hold on file attr struct */
 	buf = malloc(sz);
 	if (!buf)
 		goto error;
 
 	msz = sizeof(attr);
-	if (sz < msz)
+	if (sz < (ssize_t)msz)
 		msz = sz;
 
 	for (i = 0 ; i < nre; i++) {
 
+		/*
+		 * must read entire on-file attr struct to
+		 * sync up with layout.
+		 */
 		ret = read(fd, buf, sz);
 		if (ret != (ssize_t)sz)
 			goto error;
@@ -1316,6 +1321,12 @@ static void print_cpuid(struct perf_header *ph, int fd, FILE *fp)
 	free(str);
 }
 
+static void print_branch_stack(struct perf_header *ph __used, int fd __used,
+			       FILE *fp)
+{
+	fprintf(fp, "# contains samples with branch stack\n");
+}
+
 static int __event_process_build_id(struct build_id_event *bev,
 				    char *filename,
 				    struct perf_session *session)
@@ -1520,6 +1531,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPA(HEADER_CMDLINE,	cmdline),
 	FEAT_OPF(HEADER_CPU_TOPOLOGY,	cpu_topology),
 	FEAT_OPF(HEADER_NUMA_TOPOLOGY,	numa_topology),
+	FEAT_OPA(HEADER_BRANCH_STACK,	branch_stack),
 };
 
 struct header_print_data {
@@ -1804,35 +1816,101 @@ out_free:
 	return err;
 }
 
-static int check_magic_endian(u64 *magic, struct perf_file_header *header,
-			      struct perf_header *ph)
+static const int attr_file_abi_sizes[] = {
+	[0] = PERF_ATTR_SIZE_VER0,
+	[1] = PERF_ATTR_SIZE_VER1,
+	0,
+};
+
+/*
+ * In the legacy file format, the magic number is not used to encode endianness.
+ * hdr_sz was used to encode endianness. But given that hdr_sz can vary based
+ * on ABI revisions, we need to try all combinations for all endianness to
+ * detect the endianness.
+ */
+static int try_all_file_abis(uint64_t hdr_sz, struct perf_header *ph)
 {
-	int ret;
+	uint64_t ref_size, attr_size;
+	int i;
 
-	/* check for legacy format */
-	ret = memcmp(magic, __perf_magic1, sizeof(*magic));
-	if (ret == 0) {
-		pr_debug("legacy perf.data format\n");
-		if (!header)
-			return -1;
+	for (i = 0 ; attr_file_abi_sizes[i]; i++) {
+		ref_size = attr_file_abi_sizes[i]
+			 + sizeof(struct perf_file_section);
+		if (hdr_sz != ref_size) {
+			attr_size = bswap_64(hdr_sz);
+			if (attr_size != ref_size)
+				continue;
 
-		if (header->attr_size != sizeof(struct perf_file_attr)) {
-			u64 attr_size = bswap_64(header->attr_size);
+			ph->needs_swap = true;
+		}
+		pr_debug("ABI%d perf.data file detected, need_swap=%d\n",
+			 i,
+			 ph->needs_swap);
+		return 0;
+	}
+	/* could not determine endianness */
+	return -1;
+}
 
-			if (attr_size != sizeof(struct perf_file_attr))
-				return -1;
+#define PERF_PIPE_HDR_VER0	16
+
+static const size_t attr_pipe_abi_sizes[] = {
+	[0] = PERF_PIPE_HDR_VER0,
+	0,
+};
+
+/*
+ * In the legacy pipe format, there is an implicit assumption that endiannesss
+ * between host recording the samples, and host parsing the samples is the
+ * same. This is not always the case given that the pipe output may always be
+ * redirected into a file and analyzed on a different machine with possibly a
+ * different endianness and perf_event ABI revsions in the perf tool itself.
+ */
+static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph)
+{
+	u64 attr_size;
+	int i;
+
+	for (i = 0 ; attr_pipe_abi_sizes[i]; i++) {
+		if (hdr_sz != attr_pipe_abi_sizes[i]) {
+			attr_size = bswap_64(hdr_sz);
+			if (attr_size != hdr_sz)
+				continue;
 
 			ph->needs_swap = true;
 		}
+		pr_debug("Pipe ABI%d perf.data file detected\n", i);
 		return 0;
 	}
+	return -1;
+}
+
+static int check_magic_endian(u64 magic, uint64_t hdr_sz,
+			      bool is_pipe, struct perf_header *ph)
+{
+	int ret;
+
+	/* check for legacy format */
+	ret = memcmp(&magic, __perf_magic1, sizeof(magic));
+	if (ret == 0) {
+		pr_debug("legacy perf.data format\n");
+		if (is_pipe)
+			return try_all_pipe_abis(hdr_sz, ph);
+
+		return try_all_file_abis(hdr_sz, ph);
+	}
+	/*
+	 * the new magic number serves two purposes:
+	 * - unique number to identify actual perf.data files
+	 * - encode endianness of file
+	 */
 
-	/* check magic number with same endianness */
-	if (*magic == __perf_magic2)
+	/* check magic number with one endianness */
+	if (magic == __perf_magic2)
 		return 0;
 
-	/* check magic number but opposite endianness */
-	if (*magic != __perf_magic2_sw)
+	/* check magic number with opposite endianness */
+	if (magic != __perf_magic2_sw)
 		return -1;
 
 	ph->needs_swap = true;
@@ -1851,8 +1929,11 @@ int perf_file_header__read(struct perf_file_header *header,
 	if (ret <= 0)
 		return -1;
 
-	if (check_magic_endian(&header->magic, header, ph) < 0)
+	if (check_magic_endian(header->magic,
+			       header->attr_size, false, ph) < 0) {
+		pr_debug("magic/endian check failed\n");
 		return -1;
+	}
 
 	if (ph->needs_swap) {
 		mem_bswap_64(header, offsetof(struct perf_file_header,
@@ -1939,21 +2020,17 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 	if (ret <= 0)
 		return -1;
 
-	 if (check_magic_endian(&header->magic, NULL, ph) < 0)
+	if (check_magic_endian(header->magic, header->size, true, ph) < 0) {
+		pr_debug("endian/magic failed\n");
 		return -1;
+	}
+
+	if (ph->needs_swap)
+		header->size = bswap_64(header->size);
 
 	if (repipe && do_write(STDOUT_FILENO, header, sizeof(*header)) < 0)
 		return -1;
 
-	if (header->size != sizeof(*header)) {
-		u64 size = bswap_64(header->size);
-
-		if (size != sizeof(*header))
-			return -1;
-
-		ph->needs_swap = true;
-	}
-
 	return 0;
 }
 
@@ -1973,6 +2050,52 @@ static int perf_header__read_pipe(struct perf_session *session, int fd)
 	return 0;
 }
 
+static int read_attr(int fd, struct perf_header *ph,
+		     struct perf_file_attr *f_attr)
+{
+	struct perf_event_attr *attr = &f_attr->attr;
+	size_t sz, left;
+	size_t our_sz = sizeof(f_attr->attr);
+	int ret;
+
+	memset(f_attr, 0, sizeof(*f_attr));
+
+	/* read minimal guaranteed structure */
+	ret = readn(fd, attr, PERF_ATTR_SIZE_VER0);
+	if (ret <= 0) {
+		pr_debug("cannot read %d bytes of header attr\n",
+			 PERF_ATTR_SIZE_VER0);
+		return -1;
+	}
+
+	/* on file perf_event_attr size */
+	sz = attr->size;
+
+	if (ph->needs_swap)
+		sz = bswap_32(sz);
+
+	if (sz == 0) {
+		/* assume ABI0 */
+		sz =  PERF_ATTR_SIZE_VER0;
+	} else if (sz > our_sz) {
+		pr_debug("file uses a more recent and unsupported ABI"
+			 " (%zu bytes extra)\n", sz - our_sz);
+		return -1;
+	}
+	/* what we have not yet read and that we know about */
+	left = sz - PERF_ATTR_SIZE_VER0;
+	if (left) {
+		void *ptr = attr;
+		ptr += PERF_ATTR_SIZE_VER0;
+
+		ret = readn(fd, ptr, left);
+	}
+	/* read perf_file_section, ids are read in caller */
+	ret = readn(fd, &f_attr->ids, sizeof(f_attr->ids));
+
+	return ret <= 0 ? -1 : 0;
+}
+
 int perf_session__read_header(struct perf_session *session, int fd)
 {
 	struct perf_header *header = &session->header;
@@ -1988,19 +2111,17 @@ int perf_session__read_header(struct perf_session *session, int fd)
 	if (session->fd_pipe)
 		return perf_header__read_pipe(session, fd);
 
-	if (perf_file_header__read(&f_header, header, fd) < 0) {
-		pr_debug("incompatible file format\n");
+	if (perf_file_header__read(&f_header, header, fd) < 0)
 		return -EINVAL;
-	}
 
-	nr_attrs = f_header.attrs.size / sizeof(f_attr);
+	nr_attrs = f_header.attrs.size / f_header.attr_size;
 	lseek(fd, f_header.attrs.offset, SEEK_SET);
 
 	for (i = 0; i < nr_attrs; i++) {
 		struct perf_evsel *evsel;
 		off_t tmp;
 
-		if (readn(fd, &f_attr, sizeof(f_attr)) <= 0)
+		if (read_attr(fd, header, &f_attr) < 0)
 			goto out_errno;
 
 		if (header->needs_swap)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index e68f617d082f..21a6be09c129 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -27,7 +27,7 @@ enum {
 	HEADER_EVENT_DESC,
 	HEADER_CPU_TOPOLOGY,
 	HEADER_NUMA_TOPOLOGY,
-
+	HEADER_BRANCH_STACK,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 6f505d1abac7..8380c3db1c92 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -50,21 +50,25 @@ static void hists__reset_col_len(struct hists *hists)
 		hists__set_col_len(hists, col, 0);
 }
 
+static void hists__set_unres_dso_col_len(struct hists *hists, int dso)
+{
+	const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
+
+	if (hists__col_len(hists, dso) < unresolved_col_width &&
+	    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
+	    !symbol_conf.dso_list)
+		hists__set_col_len(hists, dso, unresolved_col_width);
+}
+
 static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 {
+	const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
 	u16 len;
 
 	if (h->ms.sym)
-		hists__new_col_len(hists, HISTC_SYMBOL, h->ms.sym->namelen);
-	else {
-		const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
-
-		if (hists__col_len(hists, HISTC_DSO) < unresolved_col_width &&
-		    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
-		    !symbol_conf.dso_list)
-			hists__set_col_len(hists, HISTC_DSO,
-					   unresolved_col_width);
-	}
+		hists__new_col_len(hists, HISTC_SYMBOL, h->ms.sym->namelen + 4);
+	else
+		hists__set_unres_dso_col_len(hists, HISTC_DSO);
 
 	len = thread__comm_len(h->thread);
 	if (hists__new_col_len(hists, HISTC_COMM, len))
@@ -74,6 +78,37 @@ static void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 		len = dso__name_len(h->ms.map->dso);
 		hists__new_col_len(hists, HISTC_DSO, len);
 	}
+
+	if (h->branch_info) {
+		int symlen;
+		/*
+		 * +4 accounts for '[x] ' priv level info
+		 * +2 account of 0x prefix on raw addresses
+		 */
+		if (h->branch_info->from.sym) {
+			symlen = (int)h->branch_info->from.sym->namelen + 4;
+			hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
+
+			symlen = dso__name_len(h->branch_info->from.map->dso);
+			hists__new_col_len(hists, HISTC_DSO_FROM, symlen);
+		} else {
+			symlen = unresolved_col_width + 4 + 2;
+			hists__new_col_len(hists, HISTC_SYMBOL_FROM, symlen);
+			hists__set_unres_dso_col_len(hists, HISTC_DSO_FROM);
+		}
+
+		if (h->branch_info->to.sym) {
+			symlen = (int)h->branch_info->to.sym->namelen + 4;
+			hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
+
+			symlen = dso__name_len(h->branch_info->to.map->dso);
+			hists__new_col_len(hists, HISTC_DSO_TO, symlen);
+		} else {
+			symlen = unresolved_col_width + 4 + 2;
+			hists__new_col_len(hists, HISTC_SYMBOL_TO, symlen);
+			hists__set_unres_dso_col_len(hists, HISTC_DSO_TO);
+		}
+	}
 }
 
 static void hist_entry__add_cpumode_period(struct hist_entry *he,
@@ -195,26 +230,14 @@ static u8 symbol__parent_filter(const struct symbol *parent)
 	return 0;
 }
 
-struct hist_entry *__hists__add_entry(struct hists *hists,
+static struct hist_entry *add_hist_entry(struct hists *hists,
+				      struct hist_entry *entry,
 				      struct addr_location *al,
-				      struct symbol *sym_parent, u64 period)
+				      u64 period)
 {
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct hist_entry *he;
-	struct hist_entry entry = {
-		.thread	= al->thread,
-		.ms = {
-			.map	= al->map,
-			.sym	= al->sym,
-		},
-		.cpu	= al->cpu,
-		.ip	= al->addr,
-		.level	= al->level,
-		.period	= period,
-		.parent = sym_parent,
-		.filtered = symbol__parent_filter(sym_parent),
-	};
 	int cmp;
 
 	pthread_mutex_lock(&hists->lock);
@@ -225,7 +248,7 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 		parent = *p;
 		he = rb_entry(parent, struct hist_entry, rb_node_in);
 
-		cmp = hist_entry__cmp(&entry, he);
+		cmp = hist_entry__cmp(entry, he);
 
 		if (!cmp) {
 			he->period += period;
@@ -239,7 +262,7 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 			p = &(*p)->rb_right;
 	}
 
-	he = hist_entry__new(&entry);
+	he = hist_entry__new(entry);
 	if (!he)
 		goto out_unlock;
 
@@ -252,6 +275,51 @@ out_unlock:
 	return he;
 }
 
+struct hist_entry *__hists__add_branch_entry(struct hists *self,
+					     struct addr_location *al,
+					     struct symbol *sym_parent,
+					     struct branch_info *bi,
+					     u64 period)
+{
+	struct hist_entry entry = {
+		.thread	= al->thread,
+		.ms = {
+			.map	= bi->to.map,
+			.sym	= bi->to.sym,
+		},
+		.cpu	= al->cpu,
+		.ip	= bi->to.addr,
+		.level	= al->level,
+		.period	= period,
+		.parent = sym_parent,
+		.filtered = symbol__parent_filter(sym_parent),
+		.branch_info = bi,
+	};
+
+	return add_hist_entry(self, &entry, al, period);
+}
+
+struct hist_entry *__hists__add_entry(struct hists *self,
+				      struct addr_location *al,
+				      struct symbol *sym_parent, u64 period)
+{
+	struct hist_entry entry = {
+		.thread	= al->thread,
+		.ms = {
+			.map	= al->map,
+			.sym	= al->sym,
+		},
+		.cpu	= al->cpu,
+		.ip	= al->addr,
+		.level	= al->level,
+		.period	= period,
+		.parent = sym_parent,
+		.filtered = symbol__parent_filter(sym_parent),
+	};
+
+	return add_hist_entry(self, &entry, al, period);
+}
+
 int64_t
 hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 {
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 48e5acd1e862..9413f3e31fea 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -42,6 +42,11 @@ enum hist_column {
 	HISTC_COMM,
 	HISTC_PARENT,
 	HISTC_CPU,
+	HISTC_MISPREDICT,
+	HISTC_SYMBOL_FROM,
+	HISTC_SYMBOL_TO,
+	HISTC_DSO_FROM,
+	HISTC_DSO_TO,
 	HISTC_NR_COLS, /* Last entry */
 };
 
@@ -74,6 +79,12 @@ int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size,
 			 struct hists *hists);
 void hist_entry__free(struct hist_entry *);
 
+struct hist_entry *__hists__add_branch_entry(struct hists *self,
+					     struct addr_location *al,
+					     struct symbol *sym_parent,
+					     struct branch_info *bi,
+					     u64 period);
+
 void hists__output_resort(struct hists *self);
 void hists__output_resort_threaded(struct hists *hists);
 void hists__collapse_resort(struct hists *self);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9f833cf9c6a9..002ebbf59f48 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -24,7 +24,7 @@ static int perf_session__open(struct perf_session *self, bool force)
 		self->fd = STDIN_FILENO;
 
 		if (perf_session__read_header(self, self->fd) < 0)
-			pr_err("incompatible file format");
+			pr_err("incompatible file format (rerun with -v to learn more)");
 
 		return 0;
 	}
@@ -56,7 +56,7 @@ static int perf_session__open(struct perf_session *self, bool force)
 	}
 
 	if (perf_session__read_header(self, self->fd) < 0) {
-		pr_err("incompatible file format");
+		pr_err("incompatible file format (rerun with -v to learn more)");
 		goto out_close;
 	}
 
@@ -229,6 +229,64 @@ static bool symbol__match_parent_regex(struct symbol *sym)
 	return 0;
 }
 
+static const u8 cpumodes[] = {
+	PERF_RECORD_MISC_USER,
+	PERF_RECORD_MISC_KERNEL,
+	PERF_RECORD_MISC_GUEST_USER,
+	PERF_RECORD_MISC_GUEST_KERNEL
+};
+#define NCPUMODES (sizeof(cpumodes)/sizeof(u8))
+
+static void ip__resolve_ams(struct machine *self, struct thread *thread,
+			    struct addr_map_symbol *ams,
+			    u64 ip)
+{
+	struct addr_location al;
+	size_t i;
+	u8 m;
+
+	memset(&al, 0, sizeof(al));
+
+	for (i = 0; i < NCPUMODES; i++) {
+		m = cpumodes[i];
+		/*
+		 * We cannot use the header.misc hint to determine whether a
+		 * branch stack address is user, kernel, guest, hypervisor.
+		 * Branches may straddle the kernel/user/hypervisor boundaries.
+		 * Thus, we have to try consecutively until we find a match
+		 * or else, the symbol is unknown
+		 */
+		thread__find_addr_location(thread, self, m, MAP__FUNCTION,
+				ip, &al, NULL);
+		if (al.sym)
+			goto found;
+	}
+found:
+	ams->addr = ip;
+	ams->al_addr = al.addr;
+	ams->sym = al.sym;
+	ams->map = al.map;
+}
+
+struct branch_info *machine__resolve_bstack(struct machine *self,
+					    struct thread *thr,
+					    struct branch_stack *bs)
+{
+	struct branch_info *bi;
+	unsigned int i;
+
+	bi = calloc(bs->nr, sizeof(struct branch_info));
+	if (!bi)
+		return NULL;
+
+	for (i = 0; i < bs->nr; i++) {
+		ip__resolve_ams(self, thr, &bi[i].to, bs->entries[i].to);
+		ip__resolve_ams(self, thr, &bi[i].from, bs->entries[i].from);
+		bi[i].flags = bs->entries[i].flags;
+	}
+	return bi;
+}
+
 int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
 			       struct thread *thread,
 			       struct ip_callchain *chain,
@@ -697,6 +755,18 @@ static void callchain__printf(struct perf_sample *sample)
 		       i, sample->callchain->ips[i]);
 }
 
+static void branch_stack__printf(struct perf_sample *sample)
+{
+	uint64_t i;
+
+	printf("... branch stack: nr:%" PRIu64 "\n", sample->branch_stack->nr);
+
+	for (i = 0; i < sample->branch_stack->nr; i++)
+		printf("..... %2"PRIu64": %016" PRIx64 " -> %016" PRIx64 "\n",
+			i, sample->branch_stack->entries[i].from,
+			sample->branch_stack->entries[i].to);
+}
+
 static void perf_session__print_tstamp(struct perf_session *session,
 				       union perf_event *event,
 				       struct perf_sample *sample)
@@ -744,6 +814,9 @@ static void dump_sample(struct perf_session *session, union perf_event *event,
 
 	if (session->sample_type & PERF_SAMPLE_CALLCHAIN)
 		callchain__printf(sample);
+
+	if (session->sample_type & PERF_SAMPLE_BRANCH_STACK)
+		branch_stack__printf(sample);
 }
 
 static struct machine *
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index c8d90178e7de..7a5434c00565 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -73,6 +73,10 @@ int perf_session__resolve_callchain(struct perf_session *self, struct perf_evsel
 				    struct ip_callchain *chain,
 				    struct symbol **parent);
 
+struct branch_info *machine__resolve_bstack(struct machine *self,
+					    struct thread *thread,
+					    struct branch_stack *bs);
+
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
 void mem_bswap_64(void *src, int byte_size);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 16da30d8d765..88dbcf6f9575 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -8,6 +8,7 @@ const char	default_sort_order[] = "comm,dso,symbol";
 const char	*sort_order = default_sort_order;
 int		sort__need_collapse = 0;
 int		sort__has_parent = 0;
+int		sort__branch_mode = -1; /* -1 = means not set */
 
 enum sort_type	sort__first_dimension;
 
@@ -94,6 +95,26 @@ static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf,
 	return repsep_snprintf(bf, size, "%*s", width, self->thread->comm);
 }
 
+static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
+{
+	struct dso *dso_l = map_l ? map_l->dso : NULL;
+	struct dso *dso_r = map_r ? map_r->dso : NULL;
+	const char *dso_name_l, *dso_name_r;
+
+	if (!dso_l || !dso_r)
+		return cmp_null(dso_l, dso_r);
+
+	if (verbose) {
+		dso_name_l = dso_l->long_name;
+		dso_name_r = dso_r->long_name;
+	} else {
+		dso_name_l = dso_l->short_name;
+		dso_name_r = dso_r->short_name;
+	}
+
+	return strcmp(dso_name_l, dso_name_r);
+}
+
 struct sort_entry sort_comm = {
 	.se_header	= "Command",
 	.se_cmp		= sort__comm_cmp,
@@ -107,36 +128,74 @@ struct sort_entry sort_comm = {
 static int64_t
 sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	struct dso *dso_l = left->ms.map ? left->ms.map->dso : NULL;
-	struct dso *dso_r = right->ms.map ? right->ms.map->dso : NULL;
-	const char *dso_name_l, *dso_name_r;
+	return _sort__dso_cmp(left->ms.map, right->ms.map);
+}
 
-	if (!dso_l || !dso_r)
-		return cmp_null(dso_l, dso_r);
 
-	if (verbose) {
-		dso_name_l = dso_l->long_name;
-		dso_name_r = dso_r->long_name;
-	} else {
-		dso_name_l = dso_l->short_name;
-		dso_name_r = dso_r->short_name;
+static int64_t _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r,
+			      u64 ip_l, u64 ip_r)
+{
+	if (!sym_l || !sym_r)
+		return cmp_null(sym_l, sym_r);
+
+	if (sym_l == sym_r)
+		return 0;
+
+	if (sym_l)
+		ip_l = sym_l->start;
+	if (sym_r)
+		ip_r = sym_r->start;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static int _hist_entry__dso_snprintf(struct map *map, char *bf,
+				     size_t size, unsigned int width)
+{
+	if (map && map->dso) {
+		const char *dso_name = !verbose ? map->dso->short_name :
+			map->dso->long_name;
+		return repsep_snprintf(bf, size, "%-*s", width, dso_name);
 	}
 
-	return strcmp(dso_name_l, dso_name_r);
+	return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
 }
 
 static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf,
 				    size_t size, unsigned int width)
 {
-	if (self->ms.map && self->ms.map->dso) {
-		const char *dso_name = !verbose ? self->ms.map->dso->short_name :
-						  self->ms.map->dso->long_name;
-		return repsep_snprintf(bf, size, "%-*s", width, dso_name);
+	return _hist_entry__dso_snprintf(self->ms.map, bf, size, width);
+}
+
+static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym,
+				     u64 ip, char level, char *bf, size_t size,
+				     unsigned int width __used)
+{
+	size_t ret = 0;
+
+	if (verbose) {
+		char o = map ? dso__symtab_origin(map->dso) : '!';
+		ret += repsep_snprintf(bf, size, "%-#*llx %c ",
+				       BITS_PER_LONG / 4, ip, o);
 	}
 
-	return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
+	ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", level);
+	if (sym)
+		ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+				       width - ret,
+				       sym->name);
+	else {
+		size_t len = BITS_PER_LONG / 4;
+		ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx",
+				       len, ip);
+		ret += repsep_snprintf(bf + ret, size - ret, "%-*s",
+				       width - ret, "");
+	}
+
+	return ret;
 }
 
+
 struct sort_entry sort_dso = {
 	.se_header	= "Shared Object",
 	.se_cmp		= sort__dso_cmp,
@@ -144,8 +203,14 @@ struct sort_entry sort_dso = {
 	.se_width_idx	= HISTC_DSO,
 };
 
-/* --sort symbol */
+static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width __used)
+{
+	return _hist_entry__sym_snprintf(self->ms.map, self->ms.sym, self->ip,
+					 self->level, bf, size, width);
+}
 
+/* --sort symbol */
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -163,31 +228,7 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	ip_l = left->ms.sym->start;
 	ip_r = right->ms.sym->start;
 
-	return (int64_t)(ip_r - ip_l);
-}
-
-static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
-				    size_t size, unsigned int width __used)
-{
-	size_t ret = 0;
-
-	if (verbose) {
-		char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
-		ret += repsep_snprintf(bf, size, "%-#*llx %c ",
-				       BITS_PER_LONG / 4, self->ip, o);
-	}
-
-	if (!sort_dso.elide)
-		ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", self->level);
-
-	if (self->ms.sym)
-		ret += repsep_snprintf(bf + ret, size - ret, "%s",
-				       self->ms.sym->name);
-	else
-		ret += repsep_snprintf(bf + ret, size - ret, "%-#*llx",
-				       BITS_PER_LONG / 4, self->ip);
-
-	return ret;
+	return _sort__sym_cmp(left->ms.sym, right->ms.sym, ip_l, ip_r);
 }
 
 struct sort_entry sort_sym = {
@@ -246,19 +287,155 @@ struct sort_entry sort_cpu = {
 	.se_width_idx	= HISTC_CPU,
 };
 
+static int64_t
+sort__dso_from_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return _sort__dso_cmp(left->branch_info->from.map,
+			      right->branch_info->from.map);
+}
+
+static int hist_entry__dso_from_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width)
+{
+	return _hist_entry__dso_snprintf(self->branch_info->from.map,
+					 bf, size, width);
+}
+
+struct sort_entry sort_dso_from = {
+	.se_header	= "Source Shared Object",
+	.se_cmp		= sort__dso_from_cmp,
+	.se_snprintf	= hist_entry__dso_from_snprintf,
+	.se_width_idx	= HISTC_DSO_FROM,
+};
+
+static int64_t
+sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return _sort__dso_cmp(left->branch_info->to.map,
+			      right->branch_info->to.map);
+}
+
+static int hist_entry__dso_to_snprintf(struct hist_entry *self, char *bf,
+				       size_t size, unsigned int width)
+{
+	return _hist_entry__dso_snprintf(self->branch_info->to.map,
+					 bf, size, width);
+}
+
+static int64_t
+sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct addr_map_symbol *from_l = &left->branch_info->from;
+	struct addr_map_symbol *from_r = &right->branch_info->from;
+
+	if (!from_l->sym && !from_r->sym)
+		return right->level - left->level;
+
+	return _sort__sym_cmp(from_l->sym, from_r->sym, from_l->addr,
+			     from_r->addr);
+}
+
+static int64_t
+sort__sym_to_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct addr_map_symbol *to_l = &left->branch_info->to;
+	struct addr_map_symbol *to_r = &right->branch_info->to;
+
+	if (!to_l->sym && !to_r->sym)
+		return right->level - left->level;
+
+	return _sort__sym_cmp(to_l->sym, to_r->sym, to_l->addr, to_r->addr);
+}
+
+static int hist_entry__sym_from_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width __used)
+{
+	struct addr_map_symbol *from = &self->branch_info->from;
+	return _hist_entry__sym_snprintf(from->map, from->sym, from->addr,
+					 self->level, bf, size, width);
+
+}
+
+static int hist_entry__sym_to_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width __used)
+{
+	struct addr_map_symbol *to = &self->branch_info->to;
+	return _hist_entry__sym_snprintf(to->map, to->sym, to->addr,
+					 self->level, bf, size, width);
+
+}
+
+struct sort_entry sort_dso_to = {
+	.se_header	= "Target Shared Object",
+	.se_cmp		= sort__dso_to_cmp,
+	.se_snprintf	= hist_entry__dso_to_snprintf,
+	.se_width_idx	= HISTC_DSO_TO,
+};
+
+struct sort_entry sort_sym_from = {
+	.se_header	= "Source Symbol",
+	.se_cmp		= sort__sym_from_cmp,
+	.se_snprintf	= hist_entry__sym_from_snprintf,
+	.se_width_idx	= HISTC_SYMBOL_FROM,
+};
+
+struct sort_entry sort_sym_to = {
+	.se_header	= "Target Symbol",
+	.se_cmp		= sort__sym_to_cmp,
+	.se_snprintf	= hist_entry__sym_to_snprintf,
+	.se_width_idx	= HISTC_SYMBOL_TO,
+};
+
+static int64_t
+sort__mispredict_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	const unsigned char mp = left->branch_info->flags.mispred !=
+					right->branch_info->flags.mispred;
+	const unsigned char p = left->branch_info->flags.predicted !=
+					right->branch_info->flags.predicted;
+
+	return mp || p;
+}
+
+static int hist_entry__mispredict_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width){
+	static const char *out = "N/A";
+
+	if (self->branch_info->flags.predicted)
+		out = "N";
+	else if (self->branch_info->flags.mispred)
+		out = "Y";
+
+	return repsep_snprintf(bf, size, "%-*s", width, out);
+}
+
+struct sort_entry sort_mispredict = {
+	.se_header	= "Branch Mispredicted",
+	.se_cmp		= sort__mispredict_cmp,
+	.se_snprintf	= hist_entry__mispredict_snprintf,
+	.se_width_idx	= HISTC_MISPREDICT,
+};
+
 struct sort_dimension {
 	const char		*name;
 	struct sort_entry	*entry;
 	int			taken;
 };
 
+#define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
+
 static struct sort_dimension sort_dimensions[] = {
-	{ .name = "pid",	.entry = &sort_thread,	},
-	{ .name = "comm",	.entry = &sort_comm,	},
-	{ .name = "dso",	.entry = &sort_dso,	},
-	{ .name = "symbol",	.entry = &sort_sym,	},
-	{ .name = "parent",	.entry = &sort_parent,	},
-	{ .name = "cpu",	.entry = &sort_cpu,	},
+	DIM(SORT_PID, "pid", sort_thread),
+	DIM(SORT_COMM, "comm", sort_comm),
+	DIM(SORT_DSO, "dso", sort_dso),
+	DIM(SORT_DSO_FROM, "dso_from", sort_dso_from),
+	DIM(SORT_DSO_TO, "dso_to", sort_dso_to),
+	DIM(SORT_SYM, "symbol", sort_sym),
+	DIM(SORT_SYM_TO, "symbol_from", sort_sym_from),
+	DIM(SORT_SYM_FROM, "symbol_to", sort_sym_to),
+	DIM(SORT_PARENT, "parent", sort_parent),
+	DIM(SORT_CPU, "cpu", sort_cpu),
+	DIM(SORT_MISPREDICT, "mispredict", sort_mispredict),
 };
 
 int sort_dimension__add(const char *tok)
@@ -270,7 +447,6 @@ int sort_dimension__add(const char *tok)
 
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
-
 		if (sd->entry == &sort_parent) {
 			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
 			if (ret) {
@@ -302,6 +478,16 @@ int sort_dimension__add(const char *tok)
 				sort__first_dimension = SORT_PARENT;
 			else if (!strcmp(sd->name, "cpu"))
 				sort__first_dimension = SORT_CPU;
+			else if (!strcmp(sd->name, "symbol_from"))
+				sort__first_dimension = SORT_SYM_FROM;
+			else if (!strcmp(sd->name, "symbol_to"))
+				sort__first_dimension = SORT_SYM_TO;
+			else if (!strcmp(sd->name, "dso_from"))
+				sort__first_dimension = SORT_DSO_FROM;
+			else if (!strcmp(sd->name, "dso_to"))
+				sort__first_dimension = SORT_DSO_TO;
+			else if (!strcmp(sd->name, "mispredict"))
+				sort__first_dimension = SORT_MISPREDICT;
 		}
 
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
@@ -309,7 +495,6 @@ int sort_dimension__add(const char *tok)
 
 		return 0;
 	}
-
 	return -ESRCH;
 }
 
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 3f67ae395752..472aa5a63a58 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -31,11 +31,16 @@ extern const char *parent_pattern;
 extern const char default_sort_order[];
 extern int sort__need_collapse;
 extern int sort__has_parent;
+extern int sort__branch_mode;
 extern char *field_sep;
 extern struct sort_entry sort_comm;
 extern struct sort_entry sort_dso;
 extern struct sort_entry sort_sym;
 extern struct sort_entry sort_parent;
+extern struct sort_entry sort_dso_from;
+extern struct sort_entry sort_dso_to;
+extern struct sort_entry sort_sym_from;
+extern struct sort_entry sort_sym_to;
 extern enum sort_type sort__first_dimension;
 
 /**
@@ -72,6 +77,7 @@ struct hist_entry {
 		struct hist_entry *pair;
 		struct rb_root	  sorted_chain;
 	};
+	struct branch_info	*branch_info;
 	struct callchain_root	callchain[0];
 };
 
@@ -82,6 +88,11 @@ enum sort_type {
 	SORT_SYM,
 	SORT_PARENT,
 	SORT_CPU,
+	SORT_DSO_FROM,
+	SORT_DSO_TO,
+	SORT_SYM_FROM,
+	SORT_SYM_TO,
+	SORT_MISPREDICT,
 };
 
 /*
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 2a683d4fc918..ac49ef208a5f 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -5,6 +5,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "map.h"
+#include "../perf.h"
 #include <linux/list.h>
 #include <linux/rbtree.h>
 #include <stdio.h>
@@ -96,7 +97,11 @@ struct symbol_conf {
 			*col_width_list_str;
        struct strlist	*dso_list,
 			*comm_list,
-			*sym_list;
+			*sym_list,
+			*dso_from_list,
+			*dso_to_list,
+			*sym_from_list,
+			*sym_to_list;
 	const char	*symfs;
 };
 
@@ -120,6 +125,19 @@ struct map_symbol {
 	bool	      has_children;
 };
 
+struct addr_map_symbol {
+	struct map    *map;
+	struct symbol *sym;
+	u64	      addr;
+	u64	      al_addr;
+};
+
+struct branch_info {
+	struct addr_map_symbol from;
+	struct addr_map_symbol to;
+	struct branch_flags flags;
+};
+
 struct addr_location {
 	struct thread *thread;
 	struct map    *map;
diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c
index bfba0490c098..de8ece8bcce3 100644
--- a/tools/perf/util/ui/browsers/hists.c
+++ b/tools/perf/util/ui/browsers/hists.c
@@ -805,8 +805,11 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
 		self->hists = hists;
 		self->b.refresh = hist_browser__refresh;
 		self->b.seek = ui_browser__hists_seek;
-		self->b.use_navkeypressed = true,
-		self->has_symbols = sort_sym.list.next != NULL;
+		self->b.use_navkeypressed = true;
+		if (sort__branch_mode == 1)
+			self->has_symbols = sort_sym_from.list.next != NULL;
+		else
+			self->has_symbols = sort_sym.list.next != NULL;
 	}
 
 	return self;
@@ -853,6 +856,16 @@ static int hists__browser_title(struct hists *self, char *bf, size_t size,
 	return printed;
 }
 
+static inline void free_popup_options(char **options, int n)
+{
+	int i;
+
+	for (i = 0; i < n; ++i) {
+		free(options[i]);
+		options[i] = NULL;
+	}
+}
+
 static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 				    const char *helpline, const char *ev_name,
 				    bool left_exits,
@@ -861,7 +874,10 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 {
 	struct hists *self = &evsel->hists;
 	struct hist_browser *browser = hist_browser__new(self);
+	struct branch_info *bi;
 	struct pstack *fstack;
+	char *options[16];
+	int nr_options = 0;
 	int key = -1;
 
 	if (browser == NULL)
@@ -873,13 +889,16 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 
 	ui_helpline__push(helpline);
 
+	memset(options, 0, sizeof(options));
+
 	while (1) {
 		const struct thread *thread = NULL;
 		const struct dso *dso = NULL;
-		char *options[16];
-		int nr_options = 0, choice = 0, i,
+		int choice = 0,
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
-		    browse_map = -2;
+		    annotate_f = -2, annotate_t = -2, browse_map = -2;
+
+		nr_options = 0;
 
 		key = hist_browser__run(browser, ev_name, timer, arg, delay_secs);
 
@@ -887,7 +906,6 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			thread = hist_browser__selected_thread(browser);
 			dso = browser->selection->map ? browser->selection->map->dso : NULL;
 		}
-
 		switch (key) {
 		case K_TAB:
 		case K_UNTAB:
@@ -902,7 +920,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			if (!browser->has_symbols) {
 				ui_browser__warning(&browser->b, delay_secs * 2,
 			"Annotation is only available for symbolic views, "
-			"include \"sym\" in --sort to use it.");
+			"include \"sym*\" in --sort to use it.");
 				continue;
 			}
 
@@ -972,12 +990,34 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		if (!browser->has_symbols)
 			goto add_exit_option;
 
-		if (browser->selection != NULL &&
-		    browser->selection->sym != NULL &&
-		    !browser->selection->map->dso->annotate_warned &&
-		    asprintf(&options[nr_options], "Annotate %s",
-			     browser->selection->sym->name) > 0)
-			annotate = nr_options++;
+		if (sort__branch_mode == 1) {
+			bi = browser->he_selection->branch_info;
+			if (browser->selection != NULL &&
+			    bi &&
+			    bi->from.sym != NULL &&
+			    !bi->from.map->dso->annotate_warned &&
+				asprintf(&options[nr_options], "Annotate %s",
+					 bi->from.sym->name) > 0)
+				annotate_f = nr_options++;
+
+			if (browser->selection != NULL &&
+			    bi &&
+			    bi->to.sym != NULL &&
+			    !bi->to.map->dso->annotate_warned &&
+			    (bi->to.sym != bi->from.sym ||
+			     bi->to.map->dso != bi->from.map->dso) &&
+				asprintf(&options[nr_options], "Annotate %s",
+					 bi->to.sym->name) > 0)
+				annotate_t = nr_options++;
+		} else {
+
+			if (browser->selection != NULL &&
+			    browser->selection->sym != NULL &&
+			    !browser->selection->map->dso->annotate_warned &&
+				asprintf(&options[nr_options], "Annotate %s",
+					 browser->selection->sym->name) > 0)
+				annotate = nr_options++;
+		}
 
 		if (thread != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
@@ -998,25 +1038,39 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			browse_map = nr_options++;
 add_exit_option:
 		options[nr_options++] = (char *)"Exit";
-
+retry_popup_menu:
 		choice = ui__popup_menu(nr_options, options);
 
-		for (i = 0; i < nr_options - 1; ++i)
-			free(options[i]);
-
 		if (choice == nr_options - 1)
 			break;
 
-		if (choice == -1)
+		if (choice == -1) {
+			free_popup_options(options, nr_options - 1);
 			continue;
+		}
 
-		if (choice == annotate) {
+		if (choice == annotate || choice == annotate_t || choice == annotate_f) {
 			struct hist_entry *he;
 			int err;
 do_annotate:
 			he = hist_browser__selected_entry(browser);
 			if (he == NULL)
 				continue;
+
+			/*
+			 * we stash the branch_info symbol + map into the
+			 * the ms so we don't have to rewrite all the annotation
+			 * code to use branch_info.
+			 * in branch mode, the ms struct is not used
+			 */
+			if (choice == annotate_f) {
+				he->ms.sym = he->branch_info->from.sym;
+				he->ms.map = he->branch_info->from.map;
+			}  else if (choice == annotate_t) {
+				he->ms.sym = he->branch_info->to.sym;
+				he->ms.map = he->branch_info->to.map;
+			}
+
 			/*
 			 * Don't let this be freed, say, by hists__decay_entry.
 			 */
@@ -1024,9 +1078,18 @@ do_annotate:
 			err = hist_entry__tui_annotate(he, evsel->idx,
 						       timer, arg, delay_secs);
 			he->used = false;
+			/*
+			 * offer option to annotate the other branch source or target
+			 * (if they exists) when returning from annotate
+			 */
+			if ((err == 'q' || err == CTRL('c'))
+			    && annotate_t != -2 && annotate_f != -2)
+				goto retry_popup_menu;
+
 			ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
 			if (err)
 				ui_browser__handle_resize(&browser->b);
+
 		} else if (choice == browse_map)
 			map__browse(browser->selection->map);
 		else if (choice == zoom_dso) {
@@ -1072,6 +1135,7 @@ out_free_stack:
 	pstack__delete(fstack);
 out:
 	hist_browser__delete(browser);
+	free_popup_options(options, nr_options - 1);
 	return key;
 }