Skip to content

Commit 5360cb3

Browse files
authored
gh-101282: Apply BOLT optimizations to libpython for shared builds (#104709)
Apply BOLT optimizations to libpython for shared builds. Most of the C code is in libpython so it is critical to apply BOLT there to fully realize BOLT's benefits. This change also reworks how BOLT instrumentation is applied. It effectively removes the readelf-based logic added in gh-101525 and replaces it with a mechanism that saves a copy of the pre-BOLT binary and restores that copy when necessary. This allows us to perform BOLT optimizations without having to manually delete the output binary to force a new BOLT run. Also: - add a clean-bolt target for purging BOLT files and hook that up to the clean target - .gitignore BOLT-related files Before and after this refactor, `make` will no-op after a previous run. Both versions should also share common make DAG deficiencies where targets fail to trigger as often as they need to or can trigger prematurely in certain scenarios. For example, after this change you may need to `rm profile-bolt-stamp` to force a BOLT run because there aren't appropriate non-phony targets for BOLT's make target to depend on. To make it easier to iterate on custom BOLT settings, the flags to pass to instrumentation and application are now defined in configure and can be overridden by passing BOLT_INSTRUMENT_FLAGS and BOLT_APPLY_FLAGS.
1 parent 729b252 commit 5360cb3

File tree

6 files changed

+153
-130
lines changed

6 files changed

+153
-130
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
*.gc??
2424
*.profclang?
2525
*.profraw
26+
# Copies of binaries before BOLT optimizations.
27+
*.prebolt
28+
# BOLT profile data.
29+
*.fdata
2630
*.dyn
2731
.gdb_history
2832
.purify
@@ -124,6 +128,7 @@ Tools/unicode/data/
124128
/platform
125129
/profile-clean-stamp
126130
/profile-run-stamp
131+
/profile-bolt-stamp
127132
/Python/deepfreeze/*.c
128133
/pybuilddir.txt
129134
/pyconfig.h

Doc/using/configure.rst

+7
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,13 @@ also be used to improve performance.
314314
is dependent on a combination of the build environment + the other
315315
optimization configure args + the CPU architecture, and not all combinations
316316
are supported.
317+
Versions of BOLT shipped before LLVM 16 are known to crash under some scenarios.
318+
Use of LLVM 16 or newer for BOLT optimization is strongly encouraged.
319+
320+
The :envvar:`!BOLT_INSTRUMENT_FLAGS` and :envvar:`!BOLT_APPLY_FLAGS`
321+
:program:`configure` variables can be defined to override the default set of
322+
arguments for :program:`llvm-bolt` to instrument and apply BOLT data to
323+
binaries, respectively.
317324

318325
.. versionadded:: 3.12
319326

Makefile.pre.in

+50-15
Original file line numberDiff line numberDiff line change
@@ -672,21 +672,55 @@ profile-opt: profile-run-stamp
672672
-rm -f profile-clean-stamp
673673
$(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)"
674674

675-
.PHONY: bolt-opt
676-
bolt-opt: @PREBOLT_RULE@
675+
# List of binaries that BOLT runs on.
676+
BOLT_BINARIES := @BOLT_BINARIES@
677+
678+
BOLT_INSTRUMENT_FLAGS := @BOLT_INSTRUMENT_FLAGS@
679+
BOLT_APPLY_FLAGS := @BOLT_APPLY_FLAGS@
680+
681+
.PHONY: clean-bolt
682+
clean-bolt:
683+
# Profile data.
677684
rm -f *.fdata
678-
@if $(READELF) -p .note.bolt_info $(BUILDPYTHON) | grep BOLT > /dev/null; then\
679-
echo "skip: $(BUILDPYTHON) is already BOLTed."; \
680-
else \
681-
@LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst; \
682-
./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true; \
683-
@MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata; \
684-
@LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot; \
685-
rm -f *.fdata; \
686-
rm -f $(BUILDPYTHON).bolt_inst; \
687-
mv $(BUILDPYTHON).bolt $(BUILDPYTHON); \
688-
fi
685+
# Pristine binaries before BOLT optimization.
686+
rm -f *.prebolt
687+
# BOLT instrumented binaries.
688+
rm -f *.bolt_inst
689+
690+
profile-bolt-stamp: $(BUILDPYTHON)
691+
# Ensure a pristine, pre-BOLT copy of the binary and no profile data from last run.
692+
for bin in $(BOLT_BINARIES); do \
693+
prebolt="$${bin}.prebolt"; \
694+
if [ -e "$${prebolt}" ]; then \
695+
echo "Restoring pre-BOLT binary $${prebolt}"; \
696+
mv "$${bin}.prebolt" "$${bin}"; \
697+
fi; \
698+
cp "$${bin}" "$${prebolt}"; \
699+
rm -f $${bin}.bolt.*.fdata $${bin}.fdata; \
700+
done
701+
# Instrument each binary.
702+
for bin in $(BOLT_BINARIES); do \
703+
@LLVM_BOLT@ "$${bin}" -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $${bin}.bolt) -o $${bin}.bolt_inst $(BOLT_INSTRUMENT_FLAGS); \
704+
mv "$${bin}.bolt_inst" "$${bin}"; \
705+
done
706+
# Run instrumented binaries to collect data.
707+
$(RUNSHARED) ./$(BUILDPYTHON) $(PROFILE_TASK) || true
708+
# Merge all the data files together.
709+
for bin in $(BOLT_BINARIES); do \
710+
@MERGE_FDATA@ $${bin}.*.fdata > "$${bin}.fdata"; \
711+
rm -f $${bin}.*.fdata; \
712+
done
713+
# Run bolt against the merged data to produce an optimized binary.
714+
for bin in $(BOLT_BINARIES); do \
715+
@LLVM_BOLT@ "$${bin}.prebolt" -o "$${bin}.bolt" -data="$${bin}.fdata" $(BOLT_APPLY_FLAGS); \
716+
mv "$${bin}.bolt" "$${bin}"; \
717+
done
718+
touch $@
689719

720+
.PHONY: bolt-opt
721+
bolt-opt:
722+
$(MAKE) @PREBOLT_RULE@
723+
$(MAKE) profile-bolt-stamp
690724

691725
# Compile and run with gcov
692726
.PHONY: coverage
@@ -2623,10 +2657,11 @@ profile-removal:
26232657
rm -f $(COVERAGE_INFO)
26242658
rm -rf $(COVERAGE_REPORT)
26252659
rm -f profile-run-stamp
2660+
rm -f profile-bolt-stamp
26262661

26272662
.PHONY: clean
2628-
clean: clean-retain-profile
2629-
@if test @DEF_MAKE_ALL_RULE@ = profile-opt; then \
2663+
clean: clean-retain-profile clean-bolt
2664+
@if test @DEF_MAKE_ALL_RULE@ = profile-opt -o @DEF_MAKE_ALL_RULE@ = bolt-opt; then \
26302665
rm -f profile-gen-stamp profile-clean-stamp; \
26312666
$(MAKE) profile-removal; \
26322667
fi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
BOLT optimization is now applied to the libpython shared library if building
2+
a shared library. BOLT instrumentation and application settings can now be
3+
influenced via the ``BOLT_INSTRUMENT_FLAGS`` and ``BOLT_APPLY_FLAGS``
4+
configure variables.

configure

+39-108
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

configure.ac

+48-7
Original file line numberDiff line numberDiff line change
@@ -2028,13 +2028,6 @@ if test "$Py_BOLT" = 'true' ; then
20282028
DEF_MAKE_ALL_RULE="bolt-opt"
20292029
DEF_MAKE_RULE="build_all"
20302030

2031-
AC_SUBST(READELF)
2032-
AC_CHECK_TOOLS(READELF, [readelf], "notfound")
2033-
if test "$READELF" == "notfound"
2034-
then
2035-
AC_MSG_ERROR([readelf is required for a --enable-bolt build but could not be found.])
2036-
fi
2037-
20382031
# -fno-reorder-blocks-and-partition is required for bolt to work.
20392032
# Possibly GCC only.
20402033
AX_CHECK_COMPILE_FLAG([-fno-reorder-blocks-and-partition],[
@@ -2067,6 +2060,54 @@ if test "$Py_BOLT" = 'true' ; then
20672060
fi
20682061
fi
20692062

2063+
dnl Enable BOLT of libpython if built.
2064+
AC_SUBST(BOLT_BINARIES)
2065+
BOLT_BINARIES='$(BUILDPYTHON)'
2066+
AS_VAR_IF([enable_shared], [yes], [
2067+
BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"
2068+
])
2069+
2070+
AC_ARG_VAR(
2071+
[BOLT_INSTRUMENT_FLAGS],
2072+
[Arguments to llvm-bolt when instrumenting binaries]
2073+
)
2074+
AC_MSG_CHECKING([BOLT_INSTRUMENT_FLAGS])
2075+
if test -z "${BOLT_INSTRUMENT_FLAGS}"
2076+
then
2077+
BOLT_INSTRUMENT_FLAGS=
2078+
fi
2079+
AC_MSG_RESULT([$BOLT_INSTRUMENT_FLAGS])
2080+
2081+
AC_ARG_VAR(
2082+
[BOLT_APPLY_FLAGS],
2083+
[Arguments to llvm-bolt when creating a BOLT optimized binary]
2084+
)
2085+
AC_MSG_CHECKING([BOLT_APPLY_FLAGS])
2086+
if test -z "${BOLT_APPLY_FLAGS}"
2087+
then
2088+
AS_VAR_SET(
2089+
[BOLT_APPLY_FLAGS],
2090+
[m4_join([ ],
2091+
[-update-debug-sections],
2092+
[-reorder-blocks=ext-tsp],
2093+
[-reorder-functions=hfsort+],
2094+
[-split-functions],
2095+
[-icf=1],
2096+
[-inline-all],
2097+
[-split-eh],
2098+
[-reorder-functions-use-hot-size],
2099+
[-peepholes=none],
2100+
[-jump-tables=aggressive],
2101+
[-inline-ap],
2102+
[-indirect-call-promotion=all],
2103+
[-dyno-stats],
2104+
[-use-gnu-stack],
2105+
[-frame-opt=hot]
2106+
)]
2107+
)
2108+
fi
2109+
AC_MSG_RESULT([$BOLT_APPLY_FLAGS])
2110+
20702111
# XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
20712112
# merged with this chunk of code?
20722113

0 commit comments

Comments
 (0)