perf_trampoline.c | Explore in Territory

/*

Perf trampoline instrumentation
===============================

This file contains instrumentation to allow to associate
calls to the CPython eval loop back to the names of the Python
functions and filename being executed.

Many native performance profilers like the Linux perf tools are
only available to 'see' the C stack when sampling from the profiled
process. This means that if we have the following python code:

    import time
    def foo(n):
        # Some CPU intensive code

    def bar(n):
        foo(n)

    def baz(n):
        bar(n)

    baz(10000000)

A performance profiler that is only able to see native frames will
produce the following backtrace when sampling from foo():

    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    ...

    Py_RunMain

Because the profiler is only able to see the native frames and the native
function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
then the profiler and any reporter generated by it will not be able to
associate the names of the Python functions and the filenames associated with
those calls, rendering the results useless in the Python world.

To fix this problem, we introduce the concept of a trampoline frame. A
trampoline frame is a piece of code that is unique per Python code object that
is executed before entering the CPython eval loop. This piece of code just
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
forwards all the arguments received. In this way, when a profiler samples
frames from the previous example it will see;

    _PyEval_EvalFrameDefault -----> Evaluation frame of foo()
    [Jit compiled code 3]
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault ------> Evaluation frame of bar()
    [Jit compiled code 2]
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    _PyEval_EvalFrameDefault -------> Evaluation frame of baz()
    [Jit compiled code 1]
    _PyEval_EvalFrame
    _PyEval_Vector
    _PyFunction_Vectorcall
    PyObject_Vectorcall
    call_function

    ...

    Py_RunMain

When we generate every unique copy of the trampoline (what here we called "[Jit
compiled code N]") we write the relationship between the compiled code and the
Python function that is associated with it. Every profiler requires this
information in a different format. For example, the Linux "perf" profiler
requires a file in "/tmp/perf-PID.map" (name and location not configurable)
with the following format:

    <compiled code address> <compiled code size> <name of the compiled code>

If this file is available when "perf" generates reports, it will automatically
associate every trampoline with the Python function that it is associated with
allowing it to generate reports that include Python information. These reports
then can also be filtered in a way that *only* Python information appears.

Notice that for this to work, there must be a unique copied of the trampoline
per Python code object even if the code in the trampoline is the same. To
achieve this we have a assembly template in Objects/asm_trampiline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that maps the start of the assembly code and another that marks the end
of the assembly code for the trampoline.  Then, every time we need a unique
trampoline for a Python code object, we copy the assembly code into a mmaped
area that has executable permissions and we return the start of that area as
our trampoline function.

Asking for a mmap-ed memory area for trampoline is very wasteful so we
allocate big arenas of memory in a single mmap call, we populate the entire
arena with copies of the trampoline (this allows us to now have to invalidate
the icache for the instructions in the page) and then we return the next
available chunk every time someone asks for a new trampoline. We keep a linked
list of arenas in case the current memory arena is exhausted and another one is
needed.

For the best results, Python should be compiled with
CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows
profilers to unwind using only the frame pointer and not on DWARF debug
information (note that as trampilines are dynamically generated there won't be
any DWARF information available for them).
*/

#include "Python.h"
#include "pycore_ceval.h"         // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"


#ifdef PY_HAVE_PERF_TRAMPOLINE

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>             // mmap()
#include <sys/types.h>
#include <unistd.h>               // sysconf()
#include <sys/time.h>           // gettimeofday()


#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
#define PY_HAVE_INVALIDATE_ICACHE

#if defined(__clang__) || defined(__GNUC__)
extern void __clear_cache(void *, void*);
#endif

static void invalidate_icache(char* begin, char*end) {
#if defined(__clang__) || defined(__GNUC__)
    return __clear_cache(begin, end);
#else
    return;
#endif
}
#endif

/* The function pointer is passed as last argument. The other three arguments
 * are passed in the same order as the function requires. This results in
 * shorter, more efficient ASM code for trampoline.
 */
py_evaluator;
py_trampoline;

extern void *_Py_trampoline_func_start;  // Start of the template of the
                                         // assembly trampoline
extern void *
    _Py_trampoline_func_end;  // End of the template of the assembly trampoline

struct code_arena_st { … };

code_arena_t;
trampoline_api_t;

enum perf_trampoline_type { … };

#define perf_status …
#define extra_code_index …
#define perf_code_arena …
#define trampoline_api …
#define perf_map_file …
#define persist_after_fork …
#define perf_trampoline_type …

static void
perf_map_write_entry(void *state, const void *code_addr,
                         unsigned int code_size, PyCodeObject *co)
{ … }

static void*
perf_map_init_state(void)
{ … }

static int
perf_map_free_state(void *state)
{ … }

_PyPerf_Callbacks _Py_perfmap_callbacks = …;


static size_t round_up(int64_t value, int64_t multiple) { … }

// TRAMPOLINE MANAGEMENT API

static int
new_code_arena(void)
{ … }

static void
free_code_arenas(void)
{ … }

static inline py_trampoline
code_arena_new_code(code_arena_t *code_arena)
{ … }

static inline py_trampoline
compile_trampoline(void)
{ … }

static PyObject *
py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
                        int throw)
{ … }
#endif  // PY_HAVE_PERF_TRAMPOLINE

int PyUnstable_PerfTrampoline_CompileCode(PyCodeObject *co)
{ … }

int
_PyIsPerfTrampolineActive(void)
{ … }

void
_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks)
{ … }

int
_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks)
{ … }

int
_PyPerfTrampoline_Init(int activate)
{ … }

int
_PyPerfTrampoline_Fini(void)
{ … }

void _PyPerfTrampoline_FreeArenas(void) { … }

int
PyUnstable_PerfTrampoline_SetPersistAfterFork(int enable){ … }

PyStatus
_PyPerfTrampoline_AfterFork_Child(void)
{ … }
cpython/Python/perf_trampoline.c