/* * Python Perf Trampoline Support - JIT Dump Implementation * * This file implements the perf jitdump API for Python's performance profiling * integration. It allows perf (Linux performance analysis tool) to understand * and profile dynamically generated Python bytecode by creating JIT dump files * that perf can inject into its analysis. * * * IMPORTANT: This file exports specific callback functions that are part of * Python's internal API. Do not modify the function signatures or behavior * of exported functions without coordinating with the Python core team. * * Usually the binary and libraries are mapped in separate region like below: * * address -> * --+---------------------+--//--+---------------------+-- * | .text | .data | ... | | .text | .data | ... | * --+---------------------+--//--+---------------------+-- * myprog libc.so * * So it'd be easy and straight-forward to find a mapped binary or library from an * address. * * But for JIT code, the code arena only cares about the code section. But the * resulting DSOs (which is generated by perf inject -j) contain ELF headers and * unwind info too. Then it'd generate following address space with synthesized * MMAP events. Let's say it has a sample between address B and C. * * sample * | * address -> A B v C * --------------------------------------------------------------------------------------------------- * /tmp/jitted-PID-0.so | (headers) | .text | unwind info | * /tmp/jitted-PID-1.so | (headers) | .text | unwind info | * /tmp/jitted-PID-2.so | (headers) | .text | unwind info | * ... * --------------------------------------------------------------------------------------------------- * * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see * the unwind info. If it maps both .text section and unwind sections, the sample * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing * which one is right. 
So to make perf happy we have non-overlapping ranges for each
 * DSO:
 *
 * address ->
 * -------------------------------------------------------------------------------------------------------
 * /tmp/jitted-PID-0.so | (headers) | .text | unwind info |
 * /tmp/jitted-PID-1.so | (headers) | .text | unwind info |
 * /tmp/jitted-PID-2.so | (headers) | .text | unwind info |
 * ...
 * -------------------------------------------------------------------------------------------------------
 *
 * As the trampolines are constant, we add a constant padding but in general the padding needs to have the
 * size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
 */

#include "Python.h"
#include "pycore_ceval.h"         // _PyPerf_Callbacks
#include "pycore_frame.h"
#include "pycore_interp.h"
#include "pycore_runtime.h"       // _PyRuntime

#ifdef PY_HAVE_PERF_TRAMPOLINE

/* Standard library includes for perf jitdump implementation */
#include <elf.h>                  // ELF architecture constants
#include <fcntl.h>                // File control operations
#include <stdio.h>                // Standard I/O operations
#include <stdlib.h>               // Standard library functions
#include <sys/mman.h>             // Memory mapping functions (mmap)
#include <sys/types.h>            // System data types
#include <unistd.h>               // System calls (sysconf, getpid)
#include <sys/time.h>             // Time functions (gettimeofday)
#include <sys/syscall.h>          // System call interface

// =============================================================================
//                           CONSTANTS AND CONFIGURATION
// =============================================================================

/*
 * Memory layout considerations for perf jitdump:
 *
 * Perf expects non-overlapping memory regions for each JIT-compiled function.
 * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
 * Shared Object) files that contain:
 * - ELF headers
 * - .text section (actual machine code)
 * - Unwind information (for stack traces)
 *
 * To ensure proper address space layout, we add padding between code regions.
 * This prevents address conflicts when perf maps the synthesized DSOs.
 *
 * Memory layout example:
 * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
 * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding]
 *
 * The padding size (0x100) is chosen to accommodate typical unwind info sizes
 * while maintaining 16-byte alignment requirements.
 */
#define PERF_JIT_CODE_PADDING 0x100

/* Convenient access to the global trampoline API state */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Type aliases for clarity and portability */
typedef uint64_t uword;                 // Word-sized unsigned integer
typedef const char* CodeComments;       // Code comment strings

/* Memory size constants */
#define MB (1024 * 1024)                // 1 Megabyte for buffer sizing

// =============================================================================
//                        ARCHITECTURE-SPECIFIC DEFINITIONS
// =============================================================================

/*
 * Returns the ELF machine architecture constant for the current platform.
 * This is required for the jitdump header to correctly identify the target
 * architecture for perf processing.
 *
 * The architecture is selected at compile time from predefined compiler
 * macros; exactly one `return EM_*` survives preprocessing. The value is
 * returned as uint64_t although the jitdump header stores it in a 32-bit
 * field (`elf_mach_target`).
 *
 * Returns: One of EM_X86_64, EM_386, EM_AARCH64, EM_ARM or EM_RISCV.
 */
static uint64_t GetElfMachineArchitecture(void) {
#if defined(__x86_64__) || defined(_M_X64)
    return EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    return EM_386;
#elif defined(__aarch64__)
    return EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    return EM_ARM;
#elif defined(__riscv)
    return EM_RISCV;
#else
    Py_UNREACHABLE();  // Unsupported architecture - should never reach here
    return 0;          // Only reached if Py_UNREACHABLE() returns
#endif
}

// =============================================================================
//                           PERF JITDUMP DATA STRUCTURES
// =============================================================================

/*
 * Perf jitdump file format structures
 *
 * These structures define the binary format that perf expects for JIT dump files.
 * The format is documented in the Linux perf tools source code and must match
 * exactly for proper perf integration.
 */

/*
 * Jitdump file header - written once at the beginning of each jitdump file
 * Contains metadata about the process and jitdump format version
 *
 * The struct is written to the file byte-for-byte, so field order and
 * widths are part of the on-disk format.
 */
typedef struct {
    uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
    uint32_t version;            // Jitdump format version (currently 1)
    uint32_t size;               // Size of this header structure
    uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
    uint32_t reserved;           // Reserved field (must be 0)
    uint32_t process_id;         // Process ID of the JIT compiler
    uint64_t time_stamp;         // Timestamp when jitdump was created
    uint64_t flags;              // Feature flags (currently unused)
} Header;

/*
 * Perf event types supported by the jitdump format
 * Each event type has a corresponding structure format
 *
 * The numeric value is what gets stored in BaseEvent.event.
 */
enum PerfEvent {
    PerfLoad = 0,           // Code load event (new JIT function)
    PerfMove = 1,           // Code move event (function relocated)
    PerfDebugInfo = 2,      // Debug information event
    PerfClose = 3,          // JIT session close event
    PerfUnwindingInfo = 4   // Stack unwinding information event
};

/*
 * Base event structure - common header for all perf events
 * Every event in the jitdump file starts with this structure
 */
struct BaseEvent {
    uint32_t event;         // Event type (from PerfEvent enum)
    uint32_t size;          // Total size of this event including payload
    uint64_t time_stamp;    // Timestamp when event occurred
};

/*
 * Code load event - indicates a new JIT-compiled function is available
 * This is the most important event type for Python profiling
 *
 * Only the fixed-size prefix is described by this struct; the variable
 * length name and code bytes listed below follow it in the file.
 */
typedef struct {
    struct BaseEvent base;   // Common event header
    uint32_t process_id;     // Process ID where code was generated
    uint32_t thread_id;      // Thread ID where code was generated
    uint64_t vma;            // Virtual memory address where code is loaded
    uint64_t code_address;   // Address of the actual machine code
    uint64_t code_size;      // Size of the machine code in bytes
    uint64_t code_id;        // Unique identifier for this code region
    /* Followed by:
     * - null-terminated function name string
     * - raw machine code bytes
     */
} CodeLoadEvent;

/*
 * Code unwinding information event - provides DWARF data for stack traces
 * Essential for proper stack unwinding during profiling
 */
typedef struct {
    struct BaseEvent base;       // Common event header
    uint64_t unwind_data_size;   // Size of the unwinding data
    uint64_t eh_frame_hdr_size;  // Size of the EH frame header
    uint64_t mapped_size;        // Total mapped size (with padding)
    /* Followed by:
     * - EH frame header
     * - DWARF unwinding information
     * - Padding to alignment boundary
     */
} CodeUnwindingInfoEvent;

// =============================================================================
//                              GLOBAL STATE MANAGEMENT
// =============================================================================

/*
 * Global state for the perf jitdump implementation
 *
 * This structure maintains all the state needed for generating jitdump files.
 * It's designed as a singleton since there's typically only one jitdump file
 * per Python process.
 *
 * A NULL `perf_map` is what perf_map_jit_write_entry() tests to decide
 * whether initialization still has to run.
 */
typedef struct {
    FILE* perf_map;              // File handle for the jitdump file
    PyThread_type_lock map_lock; // Thread synchronization lock
    void* mapped_buffer;         // Memory-mapped region (signals perf we're active)
    size_t mapped_size;          // Size of the mapped region
    int code_id;                 // Counter for unique code region identifiers
                                 // (plain int here; CodeLoadEvent.code_id is uint64_t)
} PerfMapJitState;

/* Global singleton instance (static storage, so all fields start out zero) */
static PerfMapJitState perf_jit_map_state;

// =============================================================================
//                                TIME UTILITIES
// =============================================================================

/* Time conversion constant */
static const intptr_t nanoseconds_per_second = 1000000000;

/*
 * Get current monotonic time in nanoseconds
 *
 * Monotonic time is preferred for event timestamps because it's not affected
 * by system clock adjustments. This ensures consistent timing relationships
 * between events even if the system clock is changed.
* * Returns: Current monotonic time in nanoseconds since an arbitrary epoch */ static int64_t get_current_monotonic_ticks(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { Py_UNREACHABLE(); // Should never fail on supported systems return 0; } /* Convert to nanoseconds for maximum precision */ int64_t result = ts.tv_sec; result *= nanoseconds_per_second; result += ts.tv_nsec; return result; } /* * Get current wall clock time in microseconds * * Used for the jitdump file header timestamp. Unlike monotonic time, * this represents actual wall clock time that can be correlated with * other system events. * * Returns: Current time in microseconds since Unix epoch */ static int64_t get_current_time_microseconds(void) { struct timeval tv; if (gettimeofday(&tv, NULL) < 0) { Py_UNREACHABLE(); // Should never fail on supported systems return 0; } return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } // ============================================================================= // UTILITY FUNCTIONS // ============================================================================= /* * Round up a value to the next multiple of a given number * * This is essential for maintaining proper alignment requirements in the * jitdump format. Many structures need to be aligned to specific boundaries * (typically 8 or 16 bytes) for efficient processing by perf. 
* * Args: * value: The value to round up * multiple: The multiple to round up to * * Returns: The smallest value >= input that is a multiple of 'multiple' */ static size_t round_up(int64_t value, int64_t multiple) { if (multiple == 0) { return value; // Avoid division by zero } int64_t remainder = value % multiple; if (remainder == 0) { return value; // Already aligned } /* Calculate how much to add to reach the next multiple */ int64_t difference = multiple - remainder; int64_t rounded_up_value = value + difference; return rounded_up_value; } // ============================================================================= // FILE I/O UTILITIES // ============================================================================= /* * Write data to the jitdump file with error handling * * This function ensures that all data is written to the file, handling * partial writes that can occur with large buffers or when the system * is under load. * * Args: * buffer: Pointer to data to write * size: Number of bytes to write */ static void perf_map_jit_write_fully(const void* buffer, size_t size) { FILE* out_file = perf_jit_map_state.perf_map; const char* ptr = (const char*)(buffer); while (size > 0) { const size_t written = fwrite(ptr, 1, size, out_file); if (written == 0) { Py_UNREACHABLE(); // Write failure - should be very rare break; } size -= written; ptr += written; } } /* * Write the jitdump file header * * The header must be written exactly once at the beginning of each jitdump * file. It provides metadata that perf uses to parse the rest of the file. 
* * Args: * pid: Process ID to include in the header * out_file: File handle to write to (currently unused, uses global state) */ static void perf_map_jit_write_header(int pid, FILE* out_file) { Header header; /* Initialize header with required values */ header.magic = 0x4A695444; // "JiTD" magic number header.version = 1; // Current jitdump version header.size = sizeof(Header); // Header size for validation header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture header.process_id = pid; // Process identifier header.time_stamp = get_current_time_microseconds(); // Creation time header.flags = 0; // No special flags currently used perf_map_jit_write_fully(&header, sizeof(header)); } // ============================================================================= // DWARF CONSTANTS AND UTILITIES // ============================================================================= /* * DWARF (Debug With Arbitrary Record Formats) constants * * DWARF is a debugging data format used to provide stack unwinding information. * These constants define the various encoding types and opcodes used in * DWARF Call Frame Information (CFI) records. 
 */

/* DWARF Call Frame Information version */
#define DWRF_CIE_VERSION 1

/*
 * DWARF CFA (Call Frame Address) opcodes
 *
 * The three values >= 0x40 are OR-ed with a small operand by their users in
 * this file (e.g. `DWRF_CFA_advance_loc | 4`, `DWRF_CFA_offset | reg`).
 */
enum {
    DWRF_CFA_nop = 0x0,                 // No operation
    DWRF_CFA_offset_extended = 0x5,     // Extended offset instruction
    DWRF_CFA_def_cfa = 0xc,             // Define CFA rule
    DWRF_CFA_def_cfa_offset = 0xe,      // Define CFA offset
    DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset
    DWRF_CFA_advance_loc = 0x40,        // Advance location counter
    DWRF_CFA_offset = 0x80              // Simple offset instruction
};

/*
 * DWARF Exception Handling pointer encodings
 *
 * A full encoding byte combines one data type value with one reference
 * type value, e.g. `DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4`.
 */
enum {
    DWRF_EH_PE_absptr = 0x00,   // Absolute pointer
    DWRF_EH_PE_omit = 0xff,     // Omitted value

    /* Data type encodings */
    DWRF_EH_PE_uleb128 = 0x01,  // Unsigned LEB128
    DWRF_EH_PE_udata2 = 0x02,   // Unsigned 2-byte
    DWRF_EH_PE_udata4 = 0x03,   // Unsigned 4-byte
    DWRF_EH_PE_udata8 = 0x04,   // Unsigned 8-byte
    DWRF_EH_PE_sleb128 = 0x09,  // Signed LEB128
    DWRF_EH_PE_sdata2 = 0x0a,   // Signed 2-byte
    DWRF_EH_PE_sdata4 = 0x0b,   // Signed 4-byte
    DWRF_EH_PE_sdata8 = 0x0c,   // Signed 8-byte
    DWRF_EH_PE_signed = 0x08,   // Signed flag

    /* Reference type encodings */
    DWRF_EH_PE_pcrel = 0x10,    // PC-relative
    DWRF_EH_PE_textrel = 0x20,  // Text-relative
    DWRF_EH_PE_datarel = 0x30,  // Data-relative
    DWRF_EH_PE_funcrel = 0x40,  // Function-relative
    DWRF_EH_PE_aligned = 0x50,  // Aligned
    DWRF_EH_PE_indirect = 0x80  // Indirect
};

/* Additional DWARF constants for debug information */
enum { DWRF_TAG_compile_unit = 0x11 };
enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
enum {
    DWRF_AT_name = 0x03,       // Name attribute
    DWRF_AT_stmt_list = 0x10,  // Statement list
    DWRF_AT_low_pc = 0x11,     // Low PC address
    DWRF_AT_high_pc = 0x12     // High PC address
};
enum {
    DWRF_FORM_addr = 0x01,     // Address form
    DWRF_FORM_data4 = 0x06,    // 4-byte data
    DWRF_FORM_string = 0x08    // String form
};

/* Line number program opcodes */
enum {
    DWRF_LNS_extended_op = 0,  // Extended opcode
    DWRF_LNS_copy = 1,         // Copy operation
    DWRF_LNS_advance_pc = 2,   // Advance program counter
    DWRF_LNS_advance_line = 3  // Advance line number
};

/* Line number extended opcodes */
enum {
    DWRF_LNE_end_sequence = 1, // End of sequence
    DWRF_LNE_set_address = 2   // Set address
};

/*
 * Architecture-specific DWARF register numbers
 *
 * These constants define the register numbering scheme used by DWARF
 * for each supported architecture. The numbers must match the ABI
 * specification for proper stack unwinding.
 *
 * Only x86_64 and little-endian LP64 AArch64 are accepted; any other
 * target stops the build at the #error below.
 */
enum {
#ifdef __x86_64__
    /* x86_64 register numbering (note: order is defined by x86_64 ABI) */
    /* Enumerators are implicit, so they take the values 0..16 in order. */
    DWRF_REG_AX,  // RAX
    DWRF_REG_DX,  // RDX
    DWRF_REG_CX,  // RCX
    DWRF_REG_BX,  // RBX
    DWRF_REG_SI,  // RSI
    DWRF_REG_DI,  // RDI
    DWRF_REG_BP,  // RBP
    DWRF_REG_SP,  // RSP
    DWRF_REG_8,   // R8
    DWRF_REG_9,   // R9
    DWRF_REG_10,  // R10
    DWRF_REG_11,  // R11
    DWRF_REG_12,  // R12
    DWRF_REG_13,  // R13
    DWRF_REG_14,  // R14
    DWRF_REG_15,  // R15
    DWRF_REG_RA,  // Return address (RIP)
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    /* AArch64 register numbering */
    DWRF_REG_FP = 29,  // Frame Pointer
    DWRF_REG_RA = 30,  // Link register (return address)
    DWRF_REG_SP = 31,  // Stack pointer
#else
#    error "Unsupported target architecture"
#endif
};

/* DWARF encoding constants used in EH frame headers */
/* (same numeric values as the matching DWRF_EH_PE_* enumerators above) */
static const uint8_t DwarfUData4 = 0x03;   // Unsigned 4-byte data
static const uint8_t DwarfSData4 = 0x0b;   // Signed 4-byte data
static const uint8_t DwarfPcRel = 0x10;    // PC-relative encoding
static const uint8_t DwarfDataRel = 0x30;  // Data-relative encoding

// =============================================================================
//                              ELF OBJECT CONTEXT
// =============================================================================

/*
 * Context for building ELF/DWARF structures
 *
 * This structure maintains state while constructing DWARF unwind information.
 * It acts as a simple buffer manager with pointers to track current position
 * and important landmarks within the buffer.
*/ typedef struct ELFObjectContext { uint8_t* p; // Current write position in buffer uint8_t* startp; // Start of buffer (for offset calculations) uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) uint32_t code_size; // Size of the code being described } ELFObjectContext; /* * EH Frame Header structure for DWARF unwinding * * This structure provides metadata about the DWARF unwinding information * that follows. It's required by the perf jitdump format to enable proper * stack unwinding during profiling. */ typedef struct { unsigned char version; // EH frame version (always 1) unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer unsigned char fde_count_enc; // Encoding of FDE count unsigned char table_enc; // Encoding of table entries int32_t eh_frame_ptr; // Pointer to EH frame data int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) int32_t from; // Start address of code range int32_t to; // End address of code range } EhFrameHeader; // ============================================================================= // DWARF GENERATION UTILITIES // ============================================================================= /* * Append a null-terminated string to the ELF context buffer * * Args: * ctx: ELF object context * str: String to append (must be null-terminated) * * Returns: Offset from start of buffer where string was written */ static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { uint8_t* p = ctx->p; uint32_t ofs = (uint32_t)(p - ctx->startp); /* Copy string including null terminator */ do { *p++ = (uint8_t)*str; } while (*str++); ctx->p = p; return ofs; } /* * Append a SLEB128 (Signed Little Endian Base 128) value * * SLEB128 is a variable-length encoding used extensively in DWARF. * It efficiently encodes small numbers in fewer bytes. 
* * Args: * ctx: ELF object context * v: Signed value to encode */ static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { uint8_t* p = ctx->p; /* Encode 7 bits at a time, with continuation bit in MSB */ for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit } *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit ctx->p = p; } /* * Append a ULEB128 (Unsigned Little Endian Base 128) value * * Similar to SLEB128 but for unsigned values. * * Args: * ctx: ELF object context * v: Unsigned value to encode */ static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { uint8_t* p = ctx->p; /* Encode 7 bits at a time, with continuation bit in MSB */ for (; v >= 0x80; v >>= 7) { *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit } *p++ = (char)v; // Final byte without continuation bit ctx->p = p; } /* * Macros for generating DWARF structures * * These macros provide a convenient way to write various data types * to the DWARF buffer while automatically advancing the pointer. 
 *
 * All of them expect a local `uint8_t* p` write cursor at the point of use;
 * DWRF_UV, DWRF_SV and DWRF_STR additionally expect a local
 * `ELFObjectContext* ctx`, through which they hand the cursor to the
 * elfctx_append_* helpers and read it back afterwards.
 *
 * The multi-byte writers store through a casted pointer at whatever address
 * `p` currently holds, in host byte order.
 */
#define DWRF_U8(x) (*p++ = (x))                                    // Write unsigned 8-bit
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)                        // Write signed 8-bit
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)                  // Write unsigned 16-bit
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)                  // Write unsigned 32-bit
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string

/* Align to specified boundary with NOP instructions */
/* (the mask `(s)-1` only works when `s` is a power of two) */
#define DWRF_ALIGNNOP(s)               \
    while ((uintptr_t)p & ((s)-1)) {   \
        *p++ = DWRF_CFA_nop;           \
    }

/* Write a DWARF section with automatic size calculation */
/*
 * Reserves a 4-byte length slot, runs `stmt` to emit the body, then patches
 * the slot with the number of bytes emitted after it. `name` only serves to
 * make the slot pointer's identifier unique.
 */
#define DWRF_SECTION(name, stmt)                                   \
    {                                                              \
        uint32_t* szp_##name = (uint32_t*)p;                       \
        p += 4;                                                    \
        stmt;                                                      \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4);  \
    }

// =============================================================================
//                              DWARF EH FRAME GENERATION
// =============================================================================

/*
 * Initialize DWARF .eh_frame section for a code region
 *
 * The .eh_frame section contains Call Frame Information (CFI) that describes
 * how to unwind the stack at any point in the code. This is essential for
 * proper profiling as it allows perf to generate accurate call graphs.
 *
 * The function generates two main components:
 * 1. CIE (Common Information Entry) - describes calling conventions
 * 2. 
FDE (Frame Description Entry) - describes specific function unwinding * * Args: * ctx: ELF object context containing code size and buffer pointers */ static void elf_init_ehframe(ELFObjectContext* ctx) { uint8_t* p = ctx->p; uint8_t* framep = p; // Remember start of frame data /* * DWARF Unwind Table for Trampoline Function * * This section defines DWARF Call Frame Information (CFI) using encoded macros * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function * preserves and restores registers. This is used by profiling tools (e.g., `perf`) * and debuggers for stack unwinding in JIT-compiled code. * * ------------------------------------------------- * TO REGENERATE THIS TABLE FROM GCC OBJECTS: * ------------------------------------------------- * * 1. Create a trampoline source file (e.g., `trampoline.c`): * * #include * typedef PyObject* (*py_evaluator)(void*, void*, int); * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { * return evaluator(ts, f, throwflag); * } * * 2. Compile to an object file with frame pointer preservation: * * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c * * 3. Extract DWARF unwind info from the object file: * * readelf -w trampoline.o * * Example output from `.eh_frame`: * * 00000000 CIE * Version: 1 * Augmentation: "zR" * Code alignment factor: 4 * Data alignment factor: -8 * Return address column: 30 * DW_CFA_def_cfa: r31 (sp) ofs 0 * * 00000014 FDE cie=00000000 pc=0..14 * DW_CFA_advance_loc: 4 * DW_CFA_def_cfa_offset: 16 * DW_CFA_offset: r29 at cfa-16 * DW_CFA_offset: r30 at cfa-8 * DW_CFA_advance_loc: 12 * DW_CFA_restore: r30 * DW_CFA_restore: r29 * DW_CFA_def_cfa_offset: 0 * * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. 
* * ---------------------------------- * HOW TO TRANSLATE TO DWRF_* MACROS: * ---------------------------------- * * After compiling your trampoline with: * * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c * * run: * * readelf -w trampoline.o * * to inspect the generated `.eh_frame` data. You will see two main components: * * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. * 2. An FDE (Frame Description Entry): function-specific unwind instructions. * * --------------------- * Translating the CIE: * --------------------- * From `readelf -w`, you might see: * * 00000000 0000000000000010 00000000 CIE * Version: 1 * Augmentation: "zR" * Code alignment factor: 4 * Data alignment factor: -8 * Return address column: 30 * Augmentation data: 1b * DW_CFA_def_cfa: r31 (sp) ofs 0 * * Map this to: * * DWRF_SECTION(CIE, * DWRF_U32(0); // CIE ID (always 0 for CIEs) * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 * DWRF_STR("zR"); // Augmentation string "zR" * DWRF_UV(4); // Code alignment factor = 4 * DWRF_SV(-8); // Data alignment factor = -8 * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) * DWRF_UV(1); // Augmentation data length = 1 * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers * * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) * DWRF_UV(0); // Offset = 0 * * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary * ) * * Notes: * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
* * --------------------- * Translating the FDE: * --------------------- * From `readelf -w`: * * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 * DW_CFA_advance_loc: 4 * DW_CFA_def_cfa_offset: 16 * DW_CFA_offset: r29 at cfa-16 * DW_CFA_offset: r30 at cfa-8 * DW_CFA_advance_loc: 12 * DW_CFA_restore: r30 * DW_CFA_restore: r29 * DW_CFA_def_cfa_offset: 0 * * Map the FDE header and instructions to: * * DWRF_SECTION(FDE, * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) * DWRF_U32(-0x30); // Initial PC-relative location of the code * DWRF_U32(ctx->code_size); // Code range covered by this FDE * DWRF_U8(0); // Augmentation data length (none) * * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 * DWRF_UV(16); * * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) * DWRF_UV(2); // At offset 2 * 8 = 16 bytes * * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) * DWRF_UV(1); // At offset 1 * 8 = 8 bytes * * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) * * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 * * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP * DWRF_UV(0); * ) * * To regenerate: * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as * the code is in a different address space every time. * 3. 
For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. */ /* * Emit DWARF EH CIE (Common Information Entry) * * The CIE describes the calling conventions and basic unwinding rules * that apply to all functions in this compilation unit. */ DWRF_SECTION(CIE, DWRF_U32(0); // CIE ID (0 indicates this is a CIE) DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) DWRF_UV(1); // Code alignment factor DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) DWRF_U8(DWRF_REG_RA); // Return address register number DWRF_UV(1); // Augmentation data length DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding /* Initial CFI instructions - describe default calling convention */ DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) DWRF_UV(DWRF_REG_SP); // CFA = SP register DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved DWRF_UV(1); // At offset 1 from CFA DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary ) ctx->eh_frame_p = p; // Remember start of FDE data /* * Emit DWARF EH FDE (Frame Description Entry) * * The FDE describes unwinding information specific to this function. * It references the CIE and provides function-specific CFI instructions. 
*/ DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) DWRF_U32(-0x30); // Machine code offset relative to .text DWRF_U32(ctx->code_size); // Address range covered by this FDE (code lenght) DWRF_U8(0); // Augmentation data length (none) /* * Architecture-specific CFI instructions * * These instructions describe how registers are saved and restored * during function calls. Each architecture has different calling * conventions and register usage patterns. */ #ifdef __x86_64__ /* x86_64 calling convention unwinding rules */ # if defined(__CET__) && (__CET__ & 1) DWRF_U8(DWRF_CFA_advance_loc | 8); // Advance location by 8 bytes when CET protection is enabled # else DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance location by 4 bytes # endif DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset DWRF_UV(16); // New offset: SP + 16 DWRF_U8(DWRF_CFA_advance_loc | 6); // Advance location by 6 bytes DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset DWRF_UV(8); // New offset: SP + 8 #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) /* AArch64 calling convention unwinding rules */ DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 instruction (stp x29, x30) DWRF_U8(DWRF_CFA_def_cfa_offset); // Redefine CFA offset DWRF_UV(16); // CFA = SP + 16 (stack pointer after push) DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Frame pointer (x29) saved DWRF_UV(2); // At offset 2 from CFA (2 * 8 = 16 bytes) DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Link register (x30) saved DWRF_UV(1); // At offset 1 from CFA (1 * 8 = 8 bytes) DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (mov x16, x3; mov x29, sp; ldp...) 
DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore frame pointer (x29) DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore link register (x30) DWRF_U8(DWRF_CFA_def_cfa_offset); // Final CFA adjustment DWRF_UV(0); // CFA = SP + 0 (stack restored) #else # error "Unsupported target architecture" #endif DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary ) ctx->p = p; // Update context pointer to end of generated data } // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= /* * Initialize the perf jitdump interface * * This function sets up everything needed to generate jitdump files: * 1. Creates the jitdump file with a unique name * 2. Maps the first page to signal perf that we're using the interface * 3. Writes the jitdump header * 4. Initializes synchronization primitives * * The memory mapping is crucial - perf detects jitdump files by scanning * for processes that have mapped files matching the pattern /tmp/jit-*.dump * * Returns: Pointer to initialized state, or NULL on failure */ static void* perf_map_jit_init(void) { char filename[100]; int pid = getpid(); /* Create unique filename based on process ID */ snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); /* Create/open the jitdump file with appropriate permissions */ const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); if (fd == -1) { return NULL; // Failed to create file } /* Get system page size for memory mapping */ const long page_size = sysconf(_SC_PAGESIZE); if (page_size == -1) { close(fd); return NULL; // Failed to get page size } /* * Map the first page of the jitdump file * * This memory mapping serves as a signal to perf that this process * is generating JIT code. Perf scans /proc/.../maps looking for mapped * files that match the jitdump naming pattern. * * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. 
*/ perf_jit_map_state.mapped_buffer = mmap( NULL, // Let kernel choose address page_size, // Map one page PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) MAP_PRIVATE, // Private mapping fd, // File descriptor 0 // Offset 0 (first page) ); if (perf_jit_map_state.mapped_buffer == NULL) { close(fd); return NULL; // Memory mapping failed } perf_jit_map_state.mapped_size = page_size; /* Convert file descriptor to FILE* for easier I/O operations */ perf_jit_map_state.perf_map = fdopen(fd, "w+"); if (perf_jit_map_state.perf_map == NULL) { close(fd); return NULL; // Failed to create FILE* } /* * Set up file buffering for better performance * * We use a large buffer (2MB) because jitdump files can be written * frequently during program execution. Buffering reduces system call * overhead and improves overall performance. */ setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); /* Write the jitdump file header */ perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); /* * Initialize thread synchronization lock * * Multiple threads may attempt to write to the jitdump file * simultaneously. This lock ensures thread-safe access to the * global jitdump state. */ perf_jit_map_state.map_lock = PyThread_allocate_lock(); if (perf_jit_map_state.map_lock == NULL) { fclose(perf_jit_map_state.perf_map); return NULL; // Failed to create lock } /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; /* Configure trampoline API with padding information */ trampoline_api.code_padding = PERF_JIT_CODE_PADDING; return &perf_jit_map_state; } // ============================================================================= // MAIN JITDUMP ENTRY WRITING // ============================================================================= /* * Write a complete jitdump entry for a Python function * * This is the main function called by Python's trampoline system whenever * a new piece of JIT-compiled code needs to be recorded. 
It writes both * the unwinding information and the code load event to the jitdump file. * * The function performs these steps: * 1. Initialize jitdump system if not already done * 2. Extract function name and filename from Python code object * 3. Generate DWARF unwinding information * 4. Write unwinding info event to jitdump file * 5. Write code load event to jitdump file * * Args: * state: Jitdump state (currently unused, uses global state) * code_addr: Address where the compiled code resides * code_size: Size of the compiled code in bytes * co: Python code object containing metadata * * IMPORTANT: This function signature is part of Python's internal API * and must not be changed without coordinating with core Python development. */ static void perf_map_jit_write_entry(void *state, const void *code_addr, unsigned int code_size, PyCodeObject *co) { /* Initialize jitdump system on first use */ if (perf_jit_map_state.perf_map == NULL) { void* ret = perf_map_jit_init(); if(ret == NULL){ return; // Initialization failed, silently abort } } /* * Extract function information from Python code object * * We create a human-readable function name by combining the qualified * name (includes class/module context) with the filename. This helps * developers identify functions in perf reports. */ const char *entry = ""; if (co->co_qualname != NULL) { entry = PyUnicode_AsUTF8(co->co_qualname); } const char *filename = ""; if (co->co_filename != NULL) { filename = PyUnicode_AsUTF8(co->co_filename); } /* * Create formatted function name for perf display * * Format: "py:::" * The "py::" prefix helps identify Python functions in mixed-language * profiles (e.g., when profiling C extensions alongside Python code). 
*/ size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); if (perf_map_entry == NULL) { return; // Memory allocation failed } snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); const size_t name_length = strlen(perf_map_entry); uword base = (uword)code_addr; uword size = code_size; /* * Generate DWARF unwinding information * * DWARF data is essential for proper stack unwinding during profiling. * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. */ ELFObjectContext ctx; char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) ctx.code_size = code_size; ctx.startp = ctx.p = (uint8_t*)buffer; /* Generate EH frame (Exception Handling frame) data */ elf_init_ehframe(&ctx); int eh_frame_size = ctx.p - ctx.startp; /* * Write Code Unwinding Information Event * * This event must be written before the code load event to ensure * perf has the unwinding information available when it processes * the code region. */ CodeUnwindingInfoEvent ev2; ev2.base.event = PerfUnwindingInfo; ev2.base.time_stamp = get_current_monotonic_ticks(); ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; /* Verify we don't exceed our padding budget */ assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; int padding_size = round_up(content_size, 8) - content_size; // 8-byte align ev2.base.size = content_size + padding_size; /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); /* * Write EH Frame Header * * The EH frame header provides metadata about the DWARF unwinding * information that follows. 
It includes pointers and counts that * help perf navigate the unwinding data efficiently. */ EhFrameHeader f; f.version = 1; f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte /* Calculate relative offsets for EH frame navigation */ f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); f.eh_fde_count = 1; // We generate exactly one FDE per function f.from = -(round_up(code_size, 8) + eh_frame_size); int cie_size = ctx.eh_frame_p - ctx.startp; f.to = -(eh_frame_size - cie_size); /* Write EH frame data and header */ perf_map_jit_write_fully(ctx.startp, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ char padding_bytes[] = "\0\0\0\0\0\0\0\0"; perf_map_jit_write_fully(&padding_bytes, padding_size); /* * Write Code Load Event * * This event tells perf about the new code region. It includes: * - Memory addresses and sizes * - Process and thread identification * - Function name for symbol resolution * - The actual machine code bytes */ CodeLoadEvent ev; ev.base.event = PerfLoad; ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call ev.vma = base; // Virtual memory address ev.code_address = base; // Same as VMA for our use case ev.code_size = size; /* Assign unique code ID and increment counter */ perf_jit_map_state.code_id += 1; ev.code_id = perf_jit_map_state.code_id; /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code /* Clean up allocated memory */ PyMem_RawFree(perf_map_entry); } // 
============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= /* * Finalize and cleanup the perf jitdump system * * This function is called when Python is shutting down or when the * perf trampoline system is being disabled. It ensures all resources * are properly released and all buffered data is flushed to disk. * * Args: * state: Jitdump state (currently unused, uses global state) * * Returns: 0 on success * * IMPORTANT: This function signature is part of Python's internal API * and must not be changed without coordinating with core Python development. */ static int perf_map_jit_fini(void* state) { /* * Close jitdump file with proper synchronization * * We need to acquire the lock to ensure no other threads are * writing to the file when we close it. This prevents corruption * and ensures all data is properly flushed. */ if (perf_jit_map_state.perf_map != NULL) { PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); fclose(perf_jit_map_state.perf_map); // This also flushes buffers PyThread_release_lock(perf_jit_map_state.map_lock); /* Clean up synchronization primitive */ PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } /* * Unmap the memory region * * This removes the signal to perf that we were generating JIT code. * After this point, perf will no longer detect this process as * having JIT capabilities. 
*/ if (perf_jit_map_state.mapped_buffer != NULL) { munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); perf_jit_map_state.mapped_buffer = NULL; } /* Clear global state reference */ trampoline_api.state = NULL; return 0; // Success } // ============================================================================= // PUBLIC API EXPORT // ============================================================================= /* * Python Perf Callbacks Structure * * This structure defines the callback interface that Python's trampoline * system uses to integrate with perf profiling. It contains function * pointers for initialization, event writing, and cleanup. * * CRITICAL: This structure and its contents are part of Python's internal * API. The function signatures and behavior must remain stable to maintain * compatibility with the Python interpreter's perf integration system. * * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h */ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { &perf_map_jit_init, // Initialization function &perf_map_jit_write_entry, // Event writing function &perf_map_jit_fini, // Cleanup function }; #endif /* PY_HAVE_PERF_TRAMPOLINE */