cpython/Python/optimizer.c
Guido van Rossum 11731434df
gh-104584: Emit macro expansions to opcode_metadata.h (#106163)
This produces longer traces (superblocks?).

Also improved debug output (uop names are now printed instead of numeric opcodes). This would be simpler if the numeric opcode values were generated by generate_cases.py, but that's another project.

Refactored some code in generate_cases.py so the essential algorithm for cache effects is only run once. (Deciding which effects are used and what the total cache size is, regardless of what's used.)
2023-06-28 18:28:07 +00:00

482 lines
15 KiB
C

#include "Python.h"
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_opcode.h"
#include "opcode_metadata.h"
#include "pycore_pystate.h"
#include "pycore_uops.h"
#include "cpython/optimizer.h"
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
static bool
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
if (instr->op.code == ENTER_EXECUTOR) {
return true;
}
if (code->co_executors == NULL) {
return true;
}
return code->co_executors->size < 256;
}
static int32_t
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
if (instr->op.code == ENTER_EXECUTOR) {
return instr->op.arg;
}
_PyExecutorArray *old = code->co_executors;
int size = 0;
int capacity = 0;
if (old != NULL) {
size = old->size;
capacity = old->capacity;
assert(size < 256);
}
assert(size <= capacity);
if (size == capacity) {
/* Array is full. Grow array */
int new_capacity = capacity ? capacity * 2 : 4;
_PyExecutorArray *new = PyMem_Realloc(
old,
offsetof(_PyExecutorArray, executors) +
new_capacity * sizeof(_PyExecutorObject *));
if (new == NULL) {
return -1;
}
new->capacity = new_capacity;
new->size = size;
code->co_executors = new;
}
assert(size < code->co_executors->capacity);
return size;
}
static void
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
{
Py_INCREF(executor);
if (instr->op.code == ENTER_EXECUTOR) {
assert(index == instr->op.arg);
_PyExecutorObject *old = code->co_executors->executors[index];
executor->vm_data.opcode = old->vm_data.opcode;
executor->vm_data.oparg = old->vm_data.oparg;
old->vm_data.opcode = 0;
code->co_executors->executors[index] = executor;
Py_DECREF(old);
}
else {
assert(code->co_executors->size == index);
assert(code->co_executors->capacity > index);
executor->vm_data.opcode = instr->op.code;
executor->vm_data.oparg = instr->op.arg;
code->co_executors->executors[index] = executor;
assert(index < 256);
instr->op.code = ENTER_EXECUTOR;
instr->op.arg = index;
code->co_executors->size++;
}
return;
}
int
PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *new)
{
if (instr->op.code != ENTER_EXECUTOR) {
PyErr_Format(PyExc_ValueError, "No executor to replace");
return -1;
}
int index = instr->op.arg;
assert(index >= 0);
insert_executor(code, instr, index, new);
return 0;
}
static int
error_optimize(
_PyOptimizerObject* self,
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec)
{
PyErr_Format(PyExc_SystemError, "Should never call error_optimize");
return -1;
}
static PyTypeObject DefaultOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "noop_optimizer",
.tp_basicsize = sizeof(_PyOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};
_PyOptimizerObject _PyOptimizer_Default = {
PyObject_HEAD_INIT(&DefaultOptimizer_Type)
.optimize = error_optimize,
.resume_threshold = UINT16_MAX,
.backedge_threshold = UINT16_MAX,
};
_PyOptimizerObject *
PyUnstable_GetOptimizer(void)
{
PyInterpreterState *interp = PyInterpreterState_Get();
if (interp->optimizer == &_PyOptimizer_Default) {
return NULL;
}
assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold);
assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold);
Py_INCREF(interp->optimizer);
return interp->optimizer;
}
void
PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
{
PyInterpreterState *interp = PyInterpreterState_Get();
if (optimizer == NULL) {
optimizer = &_PyOptimizer_Default;
}
_PyOptimizerObject *old = interp->optimizer;
Py_INCREF(optimizer);
interp->optimizer = optimizer;
interp->optimizer_backedge_threshold = optimizer->backedge_threshold;
interp->optimizer_resume_threshold = optimizer->resume_threshold;
Py_DECREF(old);
}
_PyInterpreterFrame *
_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer)
{
PyCodeObject *code = (PyCodeObject *)frame->f_executable;
assert(PyCode_Check(code));
PyInterpreterState *interp = PyInterpreterState_Get();
if (!has_space_for_executor(code, src)) {
goto jump_to_destination;
}
_PyOptimizerObject *opt = interp->optimizer;
_PyExecutorObject *executor = NULL;
int err = opt->optimize(opt, code, dest, &executor);
if (err <= 0) {
assert(executor == NULL);
if (err < 0) {
return NULL;
}
goto jump_to_destination;
}
int index = get_index_for_executor(code, src);
if (index < 0) {
/* Out of memory. Don't raise and assume that the
* error will show up elsewhere.
*
* If an optimizer has already produced an executor,
* it might get confused by the executor disappearing,
* but there is not much we can do about that here. */
Py_DECREF(executor);
goto jump_to_destination;
}
insert_executor(code, src, index, executor);
assert(frame->prev_instr == src);
return executor->execute(executor, frame, stack_pointer);
jump_to_destination:
frame->prev_instr = dest - 1;
_PyFrame_SetStackPointer(frame, stack_pointer);
return frame;
}
/** Test support **/
typedef struct {
_PyOptimizerObject base;
int64_t count;
} _PyCounterOptimizerObject;
typedef struct {
_PyExecutorObject executor;
_PyCounterOptimizerObject *optimizer;
_Py_CODEUNIT *next_instr;
} _PyCounterExecutorObject;
static void
counter_dealloc(_PyCounterExecutorObject *self) {
Py_DECREF(self->optimizer);
PyObject_Free(self);
}
static PyTypeObject CounterExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "counting_executor",
.tp_basicsize = sizeof(_PyCounterExecutorObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = (destructor)counter_dealloc,
};
static _PyInterpreterFrame *
counter_execute(_PyExecutorObject *self, _PyInterpreterFrame *frame, PyObject **stack_pointer)
{
((_PyCounterExecutorObject *)self)->optimizer->count++;
_PyFrame_SetStackPointer(frame, stack_pointer);
frame->prev_instr = ((_PyCounterExecutorObject *)self)->next_instr - 1;
Py_DECREF(self);
return frame;
}
static int
counter_optimize(
_PyOptimizerObject* self,
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr)
{
_PyCounterExecutorObject *executor = (_PyCounterExecutorObject *)_PyObject_New(&CounterExecutor_Type);
if (executor == NULL) {
return -1;
}
executor->executor.execute = counter_execute;
Py_INCREF(self);
executor->optimizer = (_PyCounterOptimizerObject *)self;
executor->next_instr = instr;
*exec_ptr = (_PyExecutorObject *)executor;
return 1;
}
static PyObject *
counter_get_counter(PyObject *self, PyObject *args)
{
return PyLong_FromLongLong(((_PyCounterOptimizerObject *)self)->count);
}
static PyMethodDef counter_methods[] = {
{ "get_count", counter_get_counter, METH_NOARGS, NULL },
{ NULL, NULL },
};
static PyTypeObject CounterOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "Counter optimizer",
.tp_basicsize = sizeof(_PyCounterOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_methods = counter_methods,
};
PyObject *
PyUnstable_Optimizer_NewCounter(void)
{
_PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&CounterOptimizer_Type);
if (opt == NULL) {
return NULL;
}
opt->base.optimize = counter_optimize;
opt->base.resume_threshold = UINT16_MAX;
opt->base.backedge_threshold = 0;
opt->count = 0;
return (PyObject *)opt;
}
///////////////////// Experimental UOp Optimizer /////////////////////
#ifdef Py_DEBUG
/* For debugging the interpreter: */
# define LLTRACE 1 /* Low-level trace feature */
#endif
static void
uop_dealloc(_PyUOpExecutorObject *self) {
PyObject_Free(self);
}
static PyTypeObject UOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
.tp_basicsize = sizeof(_PyUOpExecutorObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = (destructor)uop_dealloc,
};
static int
translate_bytecode_to_trace(
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int max_length)
{
#ifdef LLTRACE
char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
int lltrace = 0;
if (uop_debug != NULL && *uop_debug >= '0') {
lltrace = *uop_debug - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 4) {
fprintf(stderr,
"Optimizing %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#define ADD_TO_TRACE(OPCODE, OPERAND) \
if (lltrace >= 2) { \
const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \
fprintf(stderr, " ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \
} \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#else
#define ADD_TO_TRACE(OPCODE, OPERAND) \
trace[trace_length].opcode = (OPCODE); \
trace[trace_length].operand = (OPERAND); \
trace_length++;
#endif
int trace_length = 0;
// Always reserve space for one uop, plus SET_UP, plus EXIT_TRACE
while (trace_length + 3 <= max_length) {
int opcode = instr->op.code;
uint64_t operand = instr->op.arg;
switch (opcode) {
case LOAD_FAST_LOAD_FAST:
{
// Reserve space for two uops (+ SETUP + EXIT_TRACE)
if (trace_length + 4 > max_length) {
goto done;
}
uint64_t oparg1 = operand >> 4;
uint64_t oparg2 = operand & 15;
ADD_TO_TRACE(LOAD_FAST, oparg1);
ADD_TO_TRACE(LOAD_FAST, oparg2);
break;
}
default:
{
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
if (expansion->nuops > 0) {
// Reserve space for nuops (+ SETUP + EXIT_TRACE)
int nuops = expansion->nuops;
if (trace_length + nuops + 2 > max_length) {
goto done;
}
for (int i = 0; i < nuops; i++) {
int offset = expansion->uops[i].offset;
switch (expansion->uops[i].size) {
case 0:
break;
case 1:
operand = read_u16(&instr[offset].cache);
break;
case 2:
operand = read_u32(&instr[offset].cache);
break;
case 4:
operand = read_u64(&instr[offset].cache);
break;
default:
fprintf(stderr,
"opcode=%d, operand=%" PRIu64 "; nuops=%d, i=%d; size=%d, offset=%d\n",
opcode, operand, nuops, i,
expansion->uops[i].size,
expansion->uops[i].offset);
Py_FatalError("garbled expansion");
}
ADD_TO_TRACE(expansion->uops[i].uop, operand);
assert(expansion->uops[0].size == 0); // TODO
}
break;
}
// fprintf(stderr, "Unsupported opcode %d\n", opcode);
goto done; // Break out of while loop
}
}
instr++;
// Add cache size for opcode
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
ADD_TO_TRACE(SET_IP, (int)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
done:
if (trace_length > 0) {
ADD_TO_TRACE(EXIT_TRACE, 0);
#ifdef LLTRACE
if (lltrace >= 1) {
fprintf(stderr,
"Created a trace for %s (%s:%d) at offset %ld -- length %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive),
trace_length);
}
#endif
}
else {
#ifdef LLTRACE
if (lltrace >= 4) {
fprintf(stderr,
"No trace for %s (%s:%d) at offset %ld\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
(long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
}
#endif
}
return trace_length;
#undef ADD_TO_TRACE
}
static int
uop_optimize(
_PyOptimizerObject *self,
PyCodeObject *code,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr)
{
_PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH);
if (trace_length <= 0) {
// Error or nothing translated
return trace_length;
}
OBJECT_STAT_INC(optimization_traces_created);
_PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)_PyObject_New(&UOpExecutor_Type);
if (executor == NULL) {
return -1;
}
executor->base.execute = _PyUopExecute;
memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
*exec_ptr = (_PyExecutorObject *)executor;
return 1;
}
static PyTypeObject UOpOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_optimizer",
.tp_basicsize = sizeof(_PyOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};
PyObject *
PyUnstable_Optimizer_NewUOpOptimizer(void)
{
_PyOptimizerObject *opt = (_PyOptimizerObject *)_PyObject_New(&UOpOptimizer_Type);
if (opt == NULL) {
return NULL;
}
opt->optimize = uop_optimize;
opt->resume_threshold = UINT16_MAX;
// Need at least 3 iterations to settle specializations.
// A few lower bits of the counter are reserved for other flags.
opt->backedge_threshold = 3 << OPTIMIZER_BITS_IN_COUNTER;
return (PyObject *)opt;
}