From 58a359e585d0281ecab4d34cab9869e7eb4e4ca3 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Sat, 28 Dec 2024 12:20:42 +1300 Subject: [PATCH] Speedup tuple deformation with additional function inlining This adjusts slot_deform_heap_tuple() to add special-case loops to eliminate much of the branching that was done within the body of the main deform loop. Previously, while looping over each attribute to deform, slot_deform_heap_tuple() would always recheck if the given attribute was NULL by looking at HeapTupleHasNulls() and if so, went on to check the tuple's NULL bitmap. Since many tuples won't contain any NULLs, we can just check HeapTupleHasNulls() once and when there are no NULLs, use a more compact version of the deforming loop which contains no NULL checking code at all. The same is possible for the "slow" mode checking part of the loop. That variable was checked several times for each attribute, once to determine if the offset to the attribute value could be taken from the attcacheoff, and again to check if the offset could be cached for next time. These "slow" checks can mostly be eliminated by instead having multiple loops. Initially, we can start in the non-slow loop and break out of that loop if and only if we must stop caching the offset. This eliminates branching for both slow and non-slow deforming methods. The amount of code required for the no nulls / non-slow version is very small. It's possible to have separate loops like this due to the fact that once we move into slow mode, we never need to switch back into non-slow mode for a given tuple. We have the compiler take care of writing out the multiple required loops by having a pg_attribute_always_inline function which gets called various times passing in constant values for the "slow" and "hasnulls" parameters. This allows the compiler to eliminate const-false branches and remove comparisons for const-true ones. This commit has shown overall query performance increases of around 5-20% in deform-heavy OLAP-type workloads. Author: David Rowley Reviewed-by: Victor Yegorov Discussion: https://postgr.es/m/CAGnEbog92Og2CpC2S8=g_HozGsWtt_3kRS1sXjLz0jKSoCNfLw@mail.gmail.com Discussion: https://postgr.es/m/CAApHDvo9e0XG71WrefYaRv5n4xNPLK4k8LjD0mSR3c9KR2vi2Q@mail.gmail.com --- src/backend/executor/execTuples.c | 210 ++++++++++++++++++++++-------- 1 file changed, 155 insertions(+), 55 deletions(-) diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index 5d81c812673..a57d82b0495 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -990,6 +990,118 @@ tts_buffer_heap_store_tuple(TupleTableSlot *slot, HeapTuple tuple, } } +/* + * slot_deform_heap_tuple_internal + * An always inline helper function for use in slot_deform_heap_tuple to + * allow the compiler to emit specialized versions of this function for + * various combinations of "slow" and "hasnulls". For example, if a + * given tuple has no nulls, then we needn't check "hasnulls" for every + * attribute that we're deforming. The caller can just call this + * function with hasnulls set to constant-false and have the compiler + * remove the constant-false branches and emit more optimal code. + * + * Returns the next attnum to deform, which can be equal to natts when the + * function manages to deform all requested attributes. *offp is an input and + * output parameter which is the byte offset within the tuple to start deforming + * from which, on return, gets set to the offset where the next attribute + * should be deformed from. *slowp is set to true when subsequent deforming + * of this tuple must use a version of this function with "slow" passed as + * true. + * + * Callers cannot assume when we return "attnum" (i.e. all requested + * attributes have been deformed) that slow mode isn't required for any + * additional deforming as the final attribute may have caused a switch to + * slow mode. + */ +static pg_attribute_always_inline int +slot_deform_heap_tuple_internal(TupleTableSlot *slot, HeapTuple tuple, + int attnum, int natts, bool slow, + bool hasnulls, uint32 *offp, bool *slowp) +{ + TupleDesc tupleDesc = slot->tts_tupleDescriptor; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + HeapTupleHeader tup = tuple->t_data; + char *tp; /* ptr to tuple data */ + bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ + bool slownext = false; + + tp = (char *) tup + tup->t_hoff; + + for (; attnum < natts; attnum++) + { + CompactAttribute *thisatt = TupleDescCompactAttr(tupleDesc, attnum); + + if (hasnulls && att_isnull(attnum, bp)) + { + values[attnum] = (Datum) 0; + isnull[attnum] = true; + if (!slow) + { + *slowp = true; + return attnum + 1; + } + else + continue; + } + + isnull[attnum] = false; + + /* calculate the offset of this attribute */ + if (!slow && thisatt->attcacheoff >= 0) + *offp = thisatt->attcacheoff; + else if (thisatt->attlen == -1) + { + /* + * We can only cache the offset for a varlena attribute if the + * offset is already suitably aligned, so that there would be no + * pad bytes in any case: then the offset will be valid for either + * an aligned or unaligned value. + */ + if (!slow && *offp == att_nominal_alignby(*offp, thisatt->attalignby)) + thisatt->attcacheoff = *offp; + else + { + *offp = att_pointer_alignby(*offp, + thisatt->attalignby, + -1, + tp + *offp); + + if (!slow) + slownext = true; + } + } + else + { + /* not varlena, so safe to use att_nominal_alignby */ + *offp = att_nominal_alignby(*offp, thisatt->attalignby); + + if (!slow) + thisatt->attcacheoff = *offp; + } + + values[attnum] = fetchatt(thisatt, tp + *offp); + + *offp = att_addlength_pointer(*offp, thisatt->attlen, tp + *offp); + + /* check if we need to switch to slow mode */ + if (!slow) + { + /* + * We're unable to deform any further if the above code set + * 'slownext', or if this isn't a fixed-width attribute. + */ + if (slownext || thisatt->attlen <= 0) + { + *slowp = true; + return attnum + 1; + } + } + } + + return natts; +} + /* * slot_deform_heap_tuple * Given a TupleTableSlot, extract data from the slot's physical tuple @@ -1008,15 +1120,9 @@ static pg_attribute_always_inline void slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, int natts) { - TupleDesc tupleDesc = slot->tts_tupleDescriptor; - Datum *values = slot->tts_values; - bool *isnull = slot->tts_isnull; - HeapTupleHeader tup = tuple->t_data; bool hasnulls = HeapTupleHasNulls(tuple); int attnum; - char *tp; /* ptr to tuple data */ uint32 off; /* offset in tuple data */ - bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */ bool slow; /* can we use/set attcacheoff? */ /* We can only fetch as many attributes as the tuple has. */ @@ -1040,57 +1146,52 @@ slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, slow = TTS_SLOW(slot); } - tp = (char *) tup + tup->t_hoff; - - for (; attnum < natts; attnum++) + /* + * If 'slow' isn't set, try deforming using deforming code that does not + * contain any of the extra checks required for non-fixed offset + * deforming. During deforming, if or when we find a NULL or a variable + * length attribute, we'll switch to a deforming method which includes the + * extra code required for non-fixed offset deforming, a.k.a slow mode. + * Because this is performance critical, we inline + * slot_deform_heap_tuple_internal passing the 'slow' and 'hasnull' + * parameters as constants to allow the compiler to emit specialized code + * with the known-const false comparisons and subsequent branches removed. + */ + if (!slow) { - CompactAttribute *thisatt = TupleDescCompactAttr(tupleDesc, attnum); - - if (hasnulls && att_isnull(attnum, bp)) - { - values[attnum] = (Datum) 0; - isnull[attnum] = true; - slow = true; /* can't use attcacheoff anymore */ - continue; - } - - isnull[attnum] = false; - - if (!slow && thisatt->attcacheoff >= 0) - off = thisatt->attcacheoff; - else if (thisatt->attlen == -1) - { - /* - * We can only cache the offset for a varlena attribute if the - * offset is already suitably aligned, so that there would be no - * pad bytes in any case: then the offset will be valid for either - * an aligned or unaligned value. - */ - if (!slow && - off == att_nominal_alignby(off, thisatt->attalignby)) - thisatt->attcacheoff = off; - else - { - off = att_pointer_alignby(off, thisatt->attalignby, -1, - tp + off); - slow = true; - } - } + /* Tuple without any NULLs? We can skip doing any NULL checking */ + if (!hasnulls) + attnum = slot_deform_heap_tuple_internal(slot, + tuple, + attnum, + natts, + false, /* slow */ + false, /* hasnulls */ + &off, + &slow); else - { - /* not varlena, so safe to use att_nominal_alignby */ - off = att_nominal_alignby(off, thisatt->attalignby); + attnum = slot_deform_heap_tuple_internal(slot, + tuple, + attnum, + natts, + false, /* slow */ + true, /* hasnulls */ + &off, + &slow); + } - if (!slow) - thisatt->attcacheoff = off; - } - - values[attnum] = fetchatt(thisatt, tp + off); - - off = att_addlength_pointer(off, thisatt->attlen, tp + off); - - if (thisatt->attlen <= 0) - slow = true; /* can't use attcacheoff anymore */ + /* If there's still work to do then we must be in slow mode */ + if (attnum < natts) + { + /* XXX is it worth adding a separate call when hasnulls is false? */ + attnum = slot_deform_heap_tuple_internal(slot, + tuple, + attnum, + natts, + true, /* slow */ + hasnulls, + &off, + &slow); } /* @@ -1104,7 +1205,6 @@ slot_deform_heap_tuple(TupleTableSlot *slot, HeapTuple tuple, uint32 *offp, slot->tts_flags &= ~TTS_FLAG_SLOW; } - const TupleTableSlotOps TTSOpsVirtual = { .base_slot_size = sizeof(VirtualTupleTableSlot), .init = tts_virtual_init,