This produces longer traces (superblocks?). It also improves the debug output: uop names are now printed instead of numeric opcodes. That part would be simpler if the numeric opcode values were generated by generate_cases.py, but that's another project. Also refactored some code in generate_cases.py so the essential algorithm for cache effects runs only once: deciding which cache effects are used and what the total cache size is, regardless of which entries are actually used.
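For context, here is a minimal, hypothetical embedding sketch of how the API defined in this file can be wired up; error handling is elided and it assumes the cpython/optimizer.h declarations are in scope. PyUnstable_SetOptimizer takes its own reference to the optimizer, so the caller may drop its reference afterwards:

    /* Sketch only: install the experimental uop optimizer from embedding code. */
    PyObject *opt = PyUnstable_Optimizer_NewUOpOptimizer();
    if (opt != NULL) {
        /* PyUnstable_SetOptimizer INCREFs its argument, so our reference can be released. */
        PyUnstable_SetOptimizer((_PyOptimizerObject *)opt);
        Py_DECREF(opt);
    }

The extra debug output mentioned above is gated on the PYTHONUOPSDEBUG environment variable (see translate_bytecode_to_trace below): level 2 and up prints each ADD_TO_TRACE call with the uop name, and level 4 and up also reports which code objects are being optimized.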
#include "Python.h"
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_opcode.h"
#include "opcode_metadata.h"
#include "pycore_pystate.h"
#include "pycore_uops.h"
#include "cpython/optimizer.h"
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

static bool
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
    if (instr->op.code == ENTER_EXECUTOR) {
        return true;
    }
    if (code->co_executors == NULL) {
        return true;
    }
    return code->co_executors->size < 256;
}

static int32_t
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
    if (instr->op.code == ENTER_EXECUTOR) {
        return instr->op.arg;
    }
    _PyExecutorArray *old = code->co_executors;
    int size = 0;
    int capacity = 0;
    if (old != NULL) {
        size = old->size;
        capacity = old->capacity;
        assert(size < 256);
    }
    assert(size <= capacity);
    if (size == capacity) {
        /* Array is full. Grow array */
        int new_capacity = capacity ? capacity * 2 : 4;
        _PyExecutorArray *new = PyMem_Realloc(
            old,
            offsetof(_PyExecutorArray, executors) +
            new_capacity * sizeof(_PyExecutorObject *));
        if (new == NULL) {
            return -1;
        }
        new->capacity = new_capacity;
        new->size = size;
        code->co_executors = new;
    }
    assert(size < code->co_executors->capacity);
    return size;
}

static void
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
{
    Py_INCREF(executor);
    if (instr->op.code == ENTER_EXECUTOR) {
        assert(index == instr->op.arg);
        _PyExecutorObject *old = code->co_executors->executors[index];
        executor->vm_data.opcode = old->vm_data.opcode;
        executor->vm_data.oparg = old->vm_data.oparg;
        old->vm_data.opcode = 0;
        code->co_executors->executors[index] = executor;
        Py_DECREF(old);
    }
    else {
        assert(code->co_executors->size == index);
        assert(code->co_executors->capacity > index);
        executor->vm_data.opcode = instr->op.code;
        executor->vm_data.oparg = instr->op.arg;
        code->co_executors->executors[index] = executor;
        assert(index < 256);
        instr->op.code = ENTER_EXECUTOR;
        instr->op.arg = index;
        code->co_executors->size++;
    }
    return;
}

int
PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *new)
{
    if (instr->op.code != ENTER_EXECUTOR) {
        PyErr_Format(PyExc_ValueError, "No executor to replace");
        return -1;
    }
    int index = instr->op.arg;
    assert(index >= 0);
    insert_executor(code, instr, index, new);
    return 0;
}

static int
error_optimize(
    _PyOptimizerObject* self,
    PyCodeObject *code,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec)
{
    PyErr_Format(PyExc_SystemError, "Should never call error_optimize");
    return -1;
}

static PyTypeObject DefaultOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "noop_optimizer",
    .tp_basicsize = sizeof(_PyOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};

_PyOptimizerObject _PyOptimizer_Default = {
    PyObject_HEAD_INIT(&DefaultOptimizer_Type)
    .optimize = error_optimize,
    .resume_threshold = UINT16_MAX,
    .backedge_threshold = UINT16_MAX,
};

_PyOptimizerObject *
PyUnstable_GetOptimizer(void)
{
    PyInterpreterState *interp = PyInterpreterState_Get();
    if (interp->optimizer == &_PyOptimizer_Default) {
        return NULL;
    }
    assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold);
    assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold);
    Py_INCREF(interp->optimizer);
    return interp->optimizer;
}

void
PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer)
{
    PyInterpreterState *interp = PyInterpreterState_Get();
    if (optimizer == NULL) {
        optimizer = &_PyOptimizer_Default;
    }
    _PyOptimizerObject *old = interp->optimizer;
    Py_INCREF(optimizer);
    interp->optimizer = optimizer;
    interp->optimizer_backedge_threshold = optimizer->backedge_threshold;
    interp->optimizer_resume_threshold = optimizer->resume_threshold;
    Py_DECREF(old);
}

_PyInterpreterFrame *
_PyOptimizer_BackEdge(_PyInterpreterFrame *frame, _Py_CODEUNIT *src, _Py_CODEUNIT *dest, PyObject **stack_pointer)
{
    PyCodeObject *code = (PyCodeObject *)frame->f_executable;
    assert(PyCode_Check(code));
    PyInterpreterState *interp = PyInterpreterState_Get();
    if (!has_space_for_executor(code, src)) {
        goto jump_to_destination;
    }
    _PyOptimizerObject *opt = interp->optimizer;
    _PyExecutorObject *executor = NULL;
    int err = opt->optimize(opt, code, dest, &executor);
    if (err <= 0) {
        assert(executor == NULL);
        if (err < 0) {
            return NULL;
        }
        goto jump_to_destination;
    }
    int index = get_index_for_executor(code, src);
    if (index < 0) {
        /* Out of memory. Don't raise and assume that the
         * error will show up elsewhere.
         *
         * If an optimizer has already produced an executor,
         * it might get confused by the executor disappearing,
         * but there is not much we can do about that here. */
        Py_DECREF(executor);
        goto jump_to_destination;
    }
    insert_executor(code, src, index, executor);
    assert(frame->prev_instr == src);
    return executor->execute(executor, frame, stack_pointer);
jump_to_destination:
    frame->prev_instr = dest - 1;
    _PyFrame_SetStackPointer(frame, stack_pointer);
    return frame;
}

/** Test support **/


typedef struct {
    _PyOptimizerObject base;
    int64_t count;
} _PyCounterOptimizerObject;

typedef struct {
    _PyExecutorObject executor;
    _PyCounterOptimizerObject *optimizer;
    _Py_CODEUNIT *next_instr;
} _PyCounterExecutorObject;

static void
counter_dealloc(_PyCounterExecutorObject *self) {
    Py_DECREF(self->optimizer);
    PyObject_Free(self);
}

static PyTypeObject CounterExecutor_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "counting_executor",
    .tp_basicsize = sizeof(_PyCounterExecutorObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
    .tp_dealloc = (destructor)counter_dealloc,
};

static _PyInterpreterFrame *
counter_execute(_PyExecutorObject *self, _PyInterpreterFrame *frame, PyObject **stack_pointer)
{
    ((_PyCounterExecutorObject *)self)->optimizer->count++;
    _PyFrame_SetStackPointer(frame, stack_pointer);
    frame->prev_instr = ((_PyCounterExecutorObject *)self)->next_instr - 1;
    Py_DECREF(self);
    return frame;
}

static int
counter_optimize(
    _PyOptimizerObject* self,
    PyCodeObject *code,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec_ptr)
{
    _PyCounterExecutorObject *executor = (_PyCounterExecutorObject *)_PyObject_New(&CounterExecutor_Type);
    if (executor == NULL) {
        return -1;
    }
    executor->executor.execute = counter_execute;
    Py_INCREF(self);
    executor->optimizer = (_PyCounterOptimizerObject *)self;
    executor->next_instr = instr;
    *exec_ptr = (_PyExecutorObject *)executor;
    return 1;
}

static PyObject *
counter_get_counter(PyObject *self, PyObject *args)
{
    return PyLong_FromLongLong(((_PyCounterOptimizerObject *)self)->count);
}

static PyMethodDef counter_methods[] = {
    { "get_count", counter_get_counter, METH_NOARGS, NULL },
    { NULL, NULL },
};

static PyTypeObject CounterOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "Counter optimizer",
    .tp_basicsize = sizeof(_PyCounterOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
    .tp_methods = counter_methods,
};

PyObject *
PyUnstable_Optimizer_NewCounter(void)
{
    _PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&CounterOptimizer_Type);
    if (opt == NULL) {
        return NULL;
    }
    opt->base.optimize = counter_optimize;
    opt->base.resume_threshold = UINT16_MAX;
    opt->base.backedge_threshold = 0;
    opt->count = 0;
    return (PyObject *)opt;
}

///////////////////// Experimental UOp Optimizer /////////////////////

#ifdef Py_DEBUG
/* For debugging the interpreter: */
# define LLTRACE 1 /* Low-level trace feature */
#endif

static void
uop_dealloc(_PyUOpExecutorObject *self) {
    PyObject_Free(self);
}

static PyTypeObject UOpExecutor_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "uop_executor",
    .tp_basicsize = sizeof(_PyUOpExecutorObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
    .tp_dealloc = (destructor)uop_dealloc,
};

static int
translate_bytecode_to_trace(
    PyCodeObject *code,
    _Py_CODEUNIT *instr,
    _PyUOpInstruction *trace,
    int max_length)
{
#ifdef LLTRACE
    char *uop_debug = Py_GETENV("PYTHONUOPSDEBUG");
    int lltrace = 0;
    if (uop_debug != NULL && *uop_debug >= '0') {
        lltrace = *uop_debug - '0';  // TODO: Parse an int and all that
    }
    if (lltrace >= 4) {
        fprintf(stderr,
                "Optimizing %s (%s:%d) at offset %ld\n",
                PyUnicode_AsUTF8(code->co_qualname),
                PyUnicode_AsUTF8(code->co_filename),
                code->co_firstlineno,
                (long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
    }
#define ADD_TO_TRACE(OPCODE, OPERAND) \
    if (lltrace >= 2) { \
        const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \
        fprintf(stderr, " ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \
    } \
    trace[trace_length].opcode = (OPCODE); \
    trace[trace_length].operand = (OPERAND); \
    trace_length++;
#else
#define ADD_TO_TRACE(OPCODE, OPERAND) \
    trace[trace_length].opcode = (OPCODE); \
    trace[trace_length].operand = (OPERAND); \
    trace_length++;
#endif

    int trace_length = 0;
    // Always reserve space for one uop, plus SET_IP, plus EXIT_TRACE
    while (trace_length + 3 <= max_length) {
        int opcode = instr->op.code;
        uint64_t operand = instr->op.arg;
        switch (opcode) {
            case LOAD_FAST_LOAD_FAST:
            {
                // Reserve space for two uops (+ SET_IP + EXIT_TRACE)
                if (trace_length + 4 > max_length) {
                    goto done;
                }
                uint64_t oparg1 = operand >> 4;
                uint64_t oparg2 = operand & 15;
                ADD_TO_TRACE(LOAD_FAST, oparg1);
                ADD_TO_TRACE(LOAD_FAST, oparg2);
                break;
            }
            default:
            {
                const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
                if (expansion->nuops > 0) {
                    // Reserve space for nuops (+ SET_IP + EXIT_TRACE)
                    int nuops = expansion->nuops;
                    if (trace_length + nuops + 2 > max_length) {
                        goto done;
                    }
                    for (int i = 0; i < nuops; i++) {
                        int offset = expansion->uops[i].offset;
                        switch (expansion->uops[i].size) {
                            case 0:
                                break;
                            case 1:
                                operand = read_u16(&instr[offset].cache);
                                break;
                            case 2:
                                operand = read_u32(&instr[offset].cache);
                                break;
                            case 4:
                                operand = read_u64(&instr[offset].cache);
                                break;
                            default:
                                fprintf(stderr,
                                        "opcode=%d, operand=%" PRIu64 "; nuops=%d, i=%d; size=%d, offset=%d\n",
                                        opcode, operand, nuops, i,
                                        expansion->uops[i].size,
                                        expansion->uops[i].offset);
                                Py_FatalError("garbled expansion");
                        }
                        ADD_TO_TRACE(expansion->uops[i].uop, operand);
                        assert(expansion->uops[0].size == 0);  // TODO
                    }
                    break;
                }
                // fprintf(stderr, "Unsupported opcode %d\n", opcode);
                goto done;  // Break out of while loop
            }
        }
        instr++;
        // Add cache size for opcode
        instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
        ADD_TO_TRACE(SET_IP, (int)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
    }
done:
    if (trace_length > 0) {
        ADD_TO_TRACE(EXIT_TRACE, 0);
#ifdef LLTRACE
        if (lltrace >= 1) {
            fprintf(stderr,
                    "Created a trace for %s (%s:%d) at offset %ld -- length %d\n",
                    PyUnicode_AsUTF8(code->co_qualname),
                    PyUnicode_AsUTF8(code->co_filename),
                    code->co_firstlineno,
                    (long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive),
                    trace_length);
        }
#endif
    }
    else {
#ifdef LLTRACE
        if (lltrace >= 4) {
            fprintf(stderr,
                    "No trace for %s (%s:%d) at offset %ld\n",
                    PyUnicode_AsUTF8(code->co_qualname),
                    PyUnicode_AsUTF8(code->co_filename),
                    code->co_firstlineno,
                    (long)(instr - (_Py_CODEUNIT *)code->co_code_adaptive));
        }
#endif
    }
    return trace_length;

#undef ADD_TO_TRACE
}

static int
uop_optimize(
    _PyOptimizerObject *self,
    PyCodeObject *code,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec_ptr)
{
    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH);
    if (trace_length <= 0) {
        // Error or nothing translated
        return trace_length;
    }
    OBJECT_STAT_INC(optimization_traces_created);
    _PyUOpExecutorObject *executor = (_PyUOpExecutorObject *)_PyObject_New(&UOpExecutor_Type);
    if (executor == NULL) {
        return -1;
    }
    executor->base.execute = _PyUopExecute;
    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
    *exec_ptr = (_PyExecutorObject *)executor;
    return 1;
}

static PyTypeObject UOpOptimizer_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "uop_optimizer",
    .tp_basicsize = sizeof(_PyOptimizerObject),
    .tp_itemsize = 0,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};

PyObject *
PyUnstable_Optimizer_NewUOpOptimizer(void)
{
    _PyOptimizerObject *opt = (_PyOptimizerObject *)_PyObject_New(&UOpOptimizer_Type);
    if (opt == NULL) {
        return NULL;
    }
    opt->optimize = uop_optimize;
    opt->resume_threshold = UINT16_MAX;
    // Need at least 3 iterations to settle specializations.
    // A few lower bits of the counter are reserved for other flags.
    opt->backedge_threshold = 3 << OPTIMIZER_BITS_IN_COUNTER;
    return (PyObject *)opt;
}