diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 27f88e0eb21..0e19795b232 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -16,6 +16,7 @@ #include "access/gin_private.h" #include "miscadmin.h" +#include "utils/memutils.h" #include "utils/rel.h" static void ginFindParents(GinBtree btree, GinBtreeStack *stack); @@ -309,15 +310,16 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) * Insert a new item to a page. * * Returns true if the insertion was finished. On false, the page was split and - * the parent needs to be updated. (a root split returns true as it doesn't - * need any further action by the caller to complete) + * the parent needs to be updated. (A root split returns true as it doesn't + * need any further action by the caller to complete.) * - * When inserting a downlink to a internal page, 'childbuf' contains the + * When inserting a downlink to an internal page, 'childbuf' contains the * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared - * atomically with the insert. Also, the existing item at the given location - * is updated to point to 'updateblkno'. + * atomically with the insert. Also, the existing item at offset stack->off + * in the target page is updated to point to updateblkno. * * stack->buffer is locked on entry, and is kept locked. + * Likewise for childbuf, if given. */ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, @@ -325,12 +327,29 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, Buffer childbuf, GinStatsData *buildStats) { Page page = BufferGetPage(stack->buffer); - XLogRecData *payloadrdata; + bool result; GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; Page newlpage = NULL, newrpage = NULL; + void *ptp_workspace = NULL; + XLogRecData payloadrdata[10]; + MemoryContext tmpCxt; + MemoryContext oldCxt; + + /* + * We do all the work of this function and its subfunctions in a temporary + * memory context. This avoids leakages and simplifies APIs, since some + * subfunctions allocate storage that has to survive until we've finished + * the WAL insertion. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "ginPlaceToPage temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldCxt = MemoryContextSwitchTo(tmpCxt); if (GinPageIsData(page)) xlflags |= GIN_INSERT_ISDATA; @@ -348,21 +367,36 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } /* - * Try to put the incoming tuple on the page. placeToPage will decide if - * the page needs to be split. + * See if the incoming tuple will fit on the page. beginPlaceToPage will + * decide if the page needs to be split, and will compute the split + * contents if so. See comments for beginPlaceToPage and execPlaceToPage + * functions for more details of the API here. */ - rc = btree->placeToPage(btree, stack->buffer, stack, - insertdata, updateblkno, - &payloadrdata, &newlpage, &newrpage); - if (rc == UNMODIFIED) - return true; - else if (rc == INSERTED) + rc = btree->beginPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &ptp_workspace, + &newlpage, &newrpage, + payloadrdata); + + if (rc == GPTP_NO_WORK) { - /* placeToPage did START_CRIT_SECTION() */ + /* Nothing to do */ + result = true; + } + else if (rc == GPTP_INSERT) + { + /* It will fit, perform the insertion */ + START_CRIT_SECTION(); + + /* Perform the page update, and set up WAL data about it */ + btree->execPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + ptp_workspace, payloadrdata); + MarkBufferDirty(stack->buffer); /* An insert to an internal page finishes the split of the child. */ - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); @@ -387,7 +421,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, * Log information about child if this was an insertion of a * downlink. */ - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { rdata[0].next = &rdata[1]; @@ -400,7 +434,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, rdata[1].next = &rdata[2]; rdata[2].buffer = childbuf; - rdata[2].buffer_std = false; + rdata[2].buffer_std = true; rdata[2].data = NULL; rdata[2].len = 0; rdata[2].next = payloadrdata; @@ -409,25 +443,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, rdata[0].next = payloadrdata; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); + PageSetLSN(page, recptr); - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); - return true; + /* Insertion is complete. */ + result = true; } - else if (rc == SPLIT) + else if (rc == GPTP_SPLIT) { - /* Didn't fit, have to split */ + /* + * Didn't fit, need to split. The split has been computed in newlpage + * and newrpage, which are pointers to palloc'd pages, not associated + * with buffers. stack->buffer is not touched yet. + */ Buffer rbuffer; BlockNumber savedRightLink; - XLogRecData rdata[2]; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; + /* Get a new index page to become the right page */ rbuffer = GinNewBuffer(btree->index); /* During index build, count the new page */ @@ -441,52 +481,27 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, savedRightLink = GinPageGetOpaque(page)->rightlink; - /* - * newlpage and newrpage are pointers to memory pages, not associated - * with buffers. stack->buffer is not touched yet. - */ - + /* Begin setting up WAL record (which we might not use) */ data.node = btree->index->rd_node; data.rblkno = BufferGetBlockNumber(rbuffer); data.flags = xlflags; - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { - Page childpage = BufferGetPage(childbuf); - - GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; - data.leftChildBlkno = BufferGetBlockNumber(childbuf); data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; } else data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplit); - - if (childbuf != InvalidBuffer) - { - rdata[0].next = &rdata[1]; - - rdata[1].buffer = childbuf; - rdata[1].buffer_std = false; - rdata[1].data = NULL; - rdata[1].len = 0; - rdata[1].next = payloadrdata; - } - else - rdata[0].next = payloadrdata; - if (stack->parent == NULL) { /* - * split root, so we need to allocate new left page and place - * pointer on root to left and right page + * splitting the root, so we need to allocate new left page and + * place pointers to left and right page on root page. */ lbuffer = GinNewBuffer(btree->index); - /* During index build, count the newly-added root page */ + /* During index build, count the new left page */ if (buildStats) { if (btree->isData) @@ -508,9 +523,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, /* * Construct a new root page containing downlinks to the new left - * and right pages. (do this in a temporary copy first rather than - * overwriting the original page directly, so that we can still - * abort gracefully if this fails.) + * and right pages. (Do this in a temporary copy rather than + * overwriting the original page directly, since we're not in the + * critical section yet.) */ newrootpg = PageGetTempPage(newrpage); GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); @@ -521,7 +536,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } else { - /* split non-root page */ + /* splitting a non-root page */ data.rrlink = savedRightLink; data.lblkno = BufferGetBlockNumber(stack->buffer); @@ -531,48 +546,70 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } /* - * Ok, we have the new contents of the left page in a temporary copy - * now (newlpage), and the newly-allocated right block has been filled - * in. The original page is still unchanged. + * OK, we have the new contents of the left page in a temporary copy + * now (newlpage), and likewise for the new contents of the + * newly-allocated right block. The original page is still unchanged. * * If this is a root split, we also have a temporary page containing - * the new contents of the root. Copy the new left page to a - * newly-allocated block, and initialize the (original) root page the - * new copy. Otherwise, copy over the temporary copy of the new left - * page over the old left page. + * the new contents of the root. */ START_CRIT_SECTION(); MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); - if (BufferIsValid(childbuf)) - MarkBufferDirty(childbuf); /* - * Restore the temporary copies over the real buffers. But don't free - * the temporary copies yet, WAL record data points to them. + * Restore the temporary copies over the real buffers. */ if (stack->parent == NULL) { + /* Splitting the root, three pages to update */ MarkBufferDirty(lbuffer); - memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ); + memcpy(page, newrootpg, BLCKSZ); memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } else { - memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ); + /* Normal split, only two pages to update */ + memcpy(page, newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } + /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + /* write WAL record */ if (RelationNeedsWAL(btree->index)) { + XLogRecData rdata[2]; XLogRecPtr recptr; + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogSplit); + + if (BufferIsValid(childbuf)) + { + rdata[0].next = &rdata[1]; + + rdata[1].buffer = childbuf; + rdata[1].buffer_std = true; + rdata[1].data = NULL; + rdata[1].len = 0; + rdata[1].next = payloadrdata; + } + else + rdata[0].next = payloadrdata; + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); - PageSetLSN(BufferGetPage(stack->buffer), recptr); + + PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) PageSetLSN(BufferGetPage(lbuffer), recptr); @@ -582,33 +619,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, END_CRIT_SECTION(); /* - * We can release the lock on the right page now, but keep the - * original buffer locked. + * We can release the locks/pins on the new pages now, but keep + * stack->buffer locked. childbuf doesn't get unlocked either. */ UnlockReleaseBuffer(rbuffer); if (stack->parent == NULL) UnlockReleaseBuffer(lbuffer); - pfree(newlpage); - pfree(newrpage); - if (newrootpg) - pfree(newrootpg); - /* * If we split the root, we're done. Otherwise the split is not * complete until the downlink for the new page has been inserted to * the parent. */ - if (stack->parent == NULL) - return true; - else - return false; + result = (stack->parent == NULL); } else { - elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc); - return false; /* keep compiler quiet */ + elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc); + result = false; /* keep compiler quiet */ } + + /* Clean up temp context */ + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + return result; } /* diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index e3ab6cfd0ee..209020992dc 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -18,7 +18,6 @@ #include "access/heapam_xlog.h" #include "lib/ilist.h" #include "miscadmin.h" -#include "utils/memutils.h" #include "utils/rel.h" /* @@ -57,6 +56,13 @@ typedef struct int rsize; /* total size on right page */ bool oldformat; /* page is in pre-9.4 format on disk */ + + /* + * If we need WAL data representing the reconstructed leaf page, it's + * stored here by computeLeafRecompressWALData. + */ + char *walinfo; /* buffer start */ + int walinfolen; /* and length */ } disassembledLeaf; typedef struct @@ -98,20 +104,18 @@ static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage); + Page *newlpage, Page *newrpage, XLogRecData *rdata); static disassembledLeaf *disassembleLeaf(Page page); static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); -static XLogRecData *constructLeafRecompressWALData(Buffer buf, - disassembledLeaf *leaf); +static void computeLeafRecompressWALData(disassembledLeaf *leaf); static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); -static void dataPlaceToPageLeafSplit(Buffer buf, - disassembledLeaf *leaf, +static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage); + Page lpage, Page rpage, XLogRecData *rdata); /* * Read TIDs from leaf data page to single uncompressed array. The TIDs are @@ -424,12 +428,25 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) } /* - * Places keys to leaf data page and fills WAL record. + * Prepare to insert data on a leaf data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. */ static GinPlaceToPageRC -dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, XLogRecData **prdata, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { GinBtreeDataLeafInsertData *items = insertdata; ItemPointer newItems = &items->items[items->curitem]; @@ -442,15 +459,11 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, bool append; int segsize; Size freespace; - MemoryContext tmpCxt; - MemoryContext oldCxt; disassembledLeaf *leaf; leafSegmentInfo *lastleftinfo; ItemPointerData maxOldItem; ItemPointerData remaining; - Assert(GinPageIsData(page)); - rbound = *GinDataPageGetRightBound(page); /* @@ -474,18 +487,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = i; } - /* - * The following operations do quite a lot of small memory allocations, - * create a temporary memory context so that we don't need to keep track - * of them individually. - */ - tmpCxt = AllocSetContextCreate(CurrentMemoryContext, - "Gin split temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldCxt = MemoryContextSwitchTo(tmpCxt); - + /* Disassemble the data on the page */ leaf = disassembleLeaf(page); /* @@ -550,16 +552,13 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); } - /* Add the new items to the segments */ + /* Add the new items to the segment list */ if (!addItemsToLeaf(leaf, newItems, maxitems)) { /* all items were duplicates, we have nothing to do */ items->curitem += maxitems; - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - - return UNMODIFIED; + return GPTP_NO_WORK; } /* @@ -592,21 +591,17 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (!needsplit) { /* - * Great, all the items fit on a single page. Construct a WAL record - * describing the changes we made, and write the segments back to the - * page. - * - * Once we start modifying the page, there's no turning back. The - * caller is responsible for calling END_CRIT_SECTION() after writing - * the WAL record. + * Great, all the items fit on a single page. If needed, prepare data + * for a WAL record describing the changes we'll make. */ - MemoryContextSwitchTo(oldCxt); if (RelationNeedsWAL(btree->index)) - *prdata = constructLeafRecompressWALData(buf, leaf); - else - *prdata = NULL; - START_CRIT_SECTION(); - dataPlaceToPageLeafRecompress(buf, leaf); + computeLeafRecompressWALData(leaf); + + /* + * We're ready to enter the critical section, but + * dataExecPlaceToPageLeaf will need access to the "leaf" data. + */ + *ptp_workspace = leaf; if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", @@ -620,7 +615,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, else { /* - * Had to split. + * Have to split. * * leafRepackItems already divided the segments between the left and * the right page. It filled the left page as full as possible, and @@ -632,7 +627,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, * until they're balanced. * * As a further heuristic, when appending items to the end of the - * page, try make the left page 75% full, one the assumption that + * page, try to make the left page 75% full, on the assumption that * subsequent insertions will probably also go to the end. This packs * the index somewhat tighter when appending to a table, which is very * common. @@ -681,11 +676,14 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, &lastleftinfo->nitems); lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; - *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ); - *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); + /* + * Now allocate a couple of temporary page images, and fill them. + */ + *newlpage = palloc(BLCKSZ); + *newrpage = palloc(BLCKSZ); - dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, - prdata, *newlpage, *newrpage); + dataPlaceToPageLeafSplit(leaf, lbound, rbound, + *newlpage, *newrpage, rdata); Assert(GinPageRightMost(page) || ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), @@ -701,12 +699,37 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, items->nitem - items->curitem - maxitems); } - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - items->curitem += maxitems; - return needsplit ? SPLIT : INSERTED; + return needsplit ? GPTP_SPLIT : GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + */ +static void +dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, void *ptp_workspace, + XLogRecData *rdata) +{ + disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; + + /* Apply changes to page */ + dataPlaceToPageLeafRecompress(buf, leaf); + + /* If needed, register WAL data built by computeLeafRecompressWALData */ + if (RelationNeedsWAL(btree->index)) + { + rdata[0].buffer = buf; + rdata[0].buffer_std = true; + rdata[0].data = leaf->walinfo; + rdata[0].len = leaf->walinfolen; + rdata[0].next = NULL; + } } /* @@ -791,7 +814,6 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) */ if (removedsomething) { - XLogRecData *payloadrdata = NULL; bool modified; /* @@ -818,8 +840,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) } if (RelationNeedsWAL(indexrel)) - payloadrdata = constructLeafRecompressWALData(buffer, leaf); + computeLeafRecompressWALData(leaf); + + /* Apply changes to page */ START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buffer, leaf); MarkBufferDirty(buffer); @@ -827,18 +852,24 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) if (RelationNeedsWAL(indexrel)) { XLogRecPtr recptr; - XLogRecData rdata; + XLogRecData rdata[2]; ginxlogVacuumDataLeafPage xlrec; xlrec.node = indexrel->rd_node; xlrec.blkno = BufferGetBlockNumber(buffer); - rdata.buffer = InvalidBuffer; - rdata.data = (char *) &xlrec; - rdata.len = offsetof(ginxlogVacuumDataLeafPage, data); - rdata.next = payloadrdata; + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &xlrec; + rdata[0].len = offsetof(ginxlogVacuumDataLeafPage, data); + rdata[0].next = &rdata[1]; - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata); + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + rdata[1].data = leaf->walinfo; + rdata[1].len = leaf->walinfolen; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, rdata); PageSetLSN(page, recptr); } @@ -848,15 +879,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) /* * Construct a ginxlogRecompressDataLeaf record representing the changes - * in *leaf. + * in *leaf. (Because this requires a palloc, we have to do it before + * we enter the critical section that actually updates the page.) */ -static XLogRecData * -constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +static void +computeLeafRecompressWALData(disassembledLeaf *leaf) { int nmodified = 0; char *walbufbegin; char *walbufend; - XLogRecData *rdata; dlist_iter iter; int segno; ginxlogRecompressDataLeaf *recompress_xlog; @@ -871,12 +902,11 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) nmodified++; } - walbufbegin = palloc( - sizeof(ginxlogRecompressDataLeaf) + - BLCKSZ + /* max size needed to hold the segment - * data */ - nmodified * 2 + /* (segno + action) per action */ - sizeof(XLogRecData)); + walbufbegin = + palloc(sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 /* (segno + action) per action */ + ); walbufend = walbufbegin; recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; @@ -944,22 +974,15 @@ constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) segno++; } - rdata = (XLogRecData *) MAXALIGN(walbufend); - rdata->buffer = buf; - rdata->buffer_std = TRUE; - rdata->data = walbufbegin; - rdata->len = walbufend - walbufbegin; - rdata->next = NULL; - - return rdata; + /* Pass back the constructed info via *leaf */ + leaf->walinfo = walbufbegin; + leaf->walinfolen = walbufend - walbufbegin; } /* * Assemble a disassembled posting tree leaf page back to a buffer. * - * *prdata is filled with WAL information about this operation. The caller - * is responsible for inserting to the WAL, along with any other information - * about the operation that triggered this recompression. + * This just updates the target buffer; WAL stuff is caller's responsibility. * * NOTE: The segment pointers must not point directly to the same buffer, * except for segments that have not been modified and whose preceding @@ -1018,13 +1041,14 @@ dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) * segments to two pages instead of one. * * This is different from the non-split cases in that this does not modify - * the original page directly, but to temporary in-memory copies of the new - * left and right pages. + * the original page directly, but writes to temporary in-memory copies of + * the new left and right pages. Also, we prepare rdata[] entries for the + * data that must be appended to the WAL record. */ static void -dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, +dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, - XLogRecData **prdata, Page lpage, Page rpage) + Page lpage, Page rpage, XLogRecData *rdata) { char *ptr; int segsize; @@ -1034,9 +1058,8 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, dlist_node *firstright; leafSegmentInfo *seginfo; - /* these must be static so they can be returned to caller */ + /* this must be static so it can be returned to caller */ static ginxlogSplitDataLeaf split_xlog; - static XLogRecData rdata[3]; /* Initialize temporary pages to hold the new left and right pages */ GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); @@ -1113,44 +1136,64 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage); rdata[2].len = rsize; rdata[2].next = NULL; - - *prdata = rdata; } /* - * Place a PostingItem to page, and fill a WAL record. + * Prepare to insert data on an internal data page. * - * If the item doesn't fit, returns false without modifying the page. + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. * - * In addition to inserting the given item, the downlink of the existing item - * at 'off' is updated to point to 'updateblkno'. + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. */ static GinPlaceToPageRC -dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) +dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) +{ + Page page = BufferGetPage(buf); + + /* If it doesn't fit, deal with split case */ + if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) + { + dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, + newlpage, newrpage, rdata); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + */ +static void +dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace, + XLogRecData *rdata) { Page page = BufferGetPage(buf); OffsetNumber off = stack->off; PostingItem *pitem; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata; - static ginxlogInsertDataInternal data; - - /* split if we have to */ - if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) - { - dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, - prdata, newlpage, newrpage); - return SPLIT; - } - - *prdata = &rdata; - Assert(GinPageIsData(page)); - - START_CRIT_SECTION(); - /* Update existing downlink to point to next page (on internal page) */ pitem = GinDataPageGetPostingItem(page, off); PostingItemSetBlockNumber(pitem, updateblkno); @@ -1159,50 +1202,106 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, pitem = (PostingItem *) insertdata; GinDataPageAddPostingItem(page, pitem, off); - data.offset = off; - data.newitem = *pitem; + if (RelationNeedsWAL(btree->index)) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertDataInternal data; - rdata.buffer = buf; - rdata.buffer_std = TRUE; - rdata.data = (char *) &data; - rdata.len = sizeof(ginxlogInsertDataInternal); - rdata.next = NULL; + data.offset = off; + data.newitem = *pitem; - return INSERTED; + rdata[0].buffer = buf; + rdata[0].buffer_std = true; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogInsertDataInternal); + rdata[0].next = NULL; + } } /* - * Places an item (or items) to a posting tree. Calls relevant function of - * internal of leaf page because they are handled very differently. + * Prepare to insert data on a posting-tree data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. */ static GinPlaceToPageRC -dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { Page page = BufferGetPage(buf); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) - return dataPlaceToPageLeaf(btree, buf, stack, insertdata, - prdata, newlpage, newrpage); + return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, + newlpage, newrpage, rdata); else - return dataPlaceToPageInternal(btree, buf, stack, - insertdata, updateblkno, - prdata, newlpage, newrpage); + return dataBeginPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + ptp_workspace, + newlpage, newrpage, rdata); } /* - * Split page and fill WAL record. Returns a new temp buffer filled with data - * that should go to the left page. The original buffer is left untouched. + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static void +dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace, + XLogRecData *rdata) +{ + Page page = BufferGetPage(buf); + + if (GinPageIsLeaf(page)) + dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, rdata); + else + dataExecPlaceToPageInternal(btree, buf, stack, insertdata, + updateblkno, ptp_workspace, rdata); +} + +/* + * Split internal page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. */ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) + Page *newlpage, Page *newrpage, XLogRecData *rdata) { Page oldpage = BufferGetPage(origbuf); OffsetNumber off = stack->off; @@ -1218,7 +1317,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* these must be static so they can be returned to caller */ static ginxlogSplitDataInternal data; - static XLogRecData rdata[4]; static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; lpage = PageGetTempPage(oldpage); @@ -1226,8 +1324,6 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); - *prdata = rdata; - /* * First construct a new list of PostingItems, which includes all the old * items, and the new item. @@ -1277,6 +1373,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* set up right bound for right page */ *GinDataPageGetRightBound(rpage) = oldbound; + /* Set up WAL data */ data.separator = separator; data.nitem = nitems; data.rightbound = oldbound; @@ -1291,6 +1388,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, rdata[1].len = nitems * sizeof(PostingItem); rdata[1].next = NULL; + /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } @@ -1855,7 +1953,8 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) btree->isMoveRight = dataIsMoveRight; btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; - btree->placeToPage = dataPlaceToPage; + btree->beginPlaceToPage = dataBeginPlaceToPage; + btree->execPlaceToPage = dataExecPlaceToPage; btree->fillRoot = ginDataFillRoot; btree->prepareDownlink = dataPrepareDownlink; diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 412f90da4db..bdf1f2e5889 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -20,9 +20,10 @@ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, - void *insertPayload, - BlockNumber updateblkno, XLogRecData **prdata, - Page *newlpage, Page *newrpage); + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage, + XLogRecData *rdata); /* * Form a tuple for entry tree. @@ -507,40 +508,63 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off, } /* - * Place tuple on page and fills WAL record + * Prepare to insert data on an entry page. * - * If the tuple doesn't fit, returns false without modifying the page. + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. * - * On insertion to an internal node, in addition to inserting the given item, - * the downlink of the existing item at 'off' is updated to point to - * 'updateblkno'. + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. Also, + * if WAL logging is needed, fill one or more entries of rdata[] with + * whatever data must be appended to the WAL record. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. */ static GinPlaceToPageRC -entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertPayload, BlockNumber updateblkno, - XLogRecData **prdata, Page *newlpage, Page *newrpage) +entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; + + /* If it doesn't fit, deal with split case */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertData, updateblkno, + newlpage, newrpage, rdata); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section. It must modify the target + * buffer and store one or more XLogRecData records describing the changes + * in rdata[]. + */ +static void +entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void *ptp_workspace, + XLogRecData *rdata) { GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf); OffsetNumber off = stack->off; OffsetNumber placed; - int cnt = 0; - /* these must be static so they can be returned to caller */ - static XLogRecData rdata[3]; - static ginxlogInsertEntry data; - - /* quick exit if it doesn't fit */ - if (!entryIsEnoughSpace(btree, buf, off, insertData)) - { - entrySplitPage(btree, buf, stack, insertPayload, updateblkno, - prdata, newlpage, newrpage); - return SPLIT; - } - - START_CRIT_SECTION(); - - *prdata = rdata; entryPreparePage(btree, page, off, insertData, updateblkno); placed = PageAddItem(page, @@ -551,39 +575,47 @@ entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); - data.isDelete = insertData->isDelete; - data.offset = off; + if (RelationNeedsWAL(btree->index)) + { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertEntry data; - rdata[cnt].buffer = buf; - rdata[cnt].buffer_std = true; - rdata[cnt].data = (char *) &data; - rdata[cnt].len = offsetof(ginxlogInsertEntry, tuple); - rdata[cnt].next = &rdata[cnt + 1]; - cnt++; + data.isDelete = insertData->isDelete; + data.offset = off; - rdata[cnt].buffer = buf; - rdata[cnt].buffer_std = true; - rdata[cnt].data = (char *) insertData->entry; - rdata[cnt].len = IndexTupleSize(insertData->entry); - rdata[cnt].next = NULL; + rdata[0].buffer = buf; + rdata[0].buffer_std = true; + rdata[0].data = (char *) &data; + rdata[0].len = offsetof(ginxlogInsertEntry, tuple); + rdata[0].next = &rdata[1]; - return INSERTED; + rdata[1].buffer = buf; + rdata[1].buffer_std = true; + rdata[1].data = (char *) insertData->entry; + rdata[1].len = IndexTupleSize(insertData->entry); + rdata[1].next = NULL; + } } /* - * Place tuple and split page, original buffer(lbuf) leaves untouched, - * returns shadow pages filled with new data. - * Tuples are distributed between pages by equal size on its, not - * an equal number! + * Split entry page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. + * Also, set up rdata[] entries describing data to be appended to WAL record. */ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, - void *insertPayload, - BlockNumber updateblkno, XLogRecData **prdata, - Page *newlpage, Page *newrpage) + GinBtreeEntryInsertData *insertData, + BlockNumber updateblkno, + Page *newlpage, Page *newrpage, + XLogRecData *rdata) { - GinBtreeEntryInsertData *insertData = insertPayload; OffsetNumber off = stack->off; OffsetNumber i, maxoff, @@ -600,11 +632,9 @@ entrySplitPage(GinBtree btree, Buffer origbuf, Size pageSize = PageGetPageSize(lpage); /* these must be static so they can be returned to caller */ - static XLogRecData rdata[2]; static ginxlogSplitEntry data; static char tupstore[2 * BLCKSZ]; - *prdata = rdata; entryPreparePage(btree, lpage, off, insertData, updateblkno); /* @@ -655,6 +685,10 @@ entrySplitPage(GinBtree btree, Buffer origbuf, { itup = (IndexTuple) ptr; + /* + * Decide where to split. We try to equalize the pages' total data + * size, not number of tuples. + */ if (lsize > totalsize / 2) { if (separator == InvalidOffsetNumber) @@ -685,6 +719,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf, rdata[1].len = tupstoresize; rdata[1].next = NULL; + /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } @@ -753,7 +788,8 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, btree->isMoveRight = entryIsMoveRight; btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; - btree->placeToPage = entryPlaceToPage; + btree->beginPlaceToPage = entryBeginPlaceToPage; + btree->execPlaceToPage = entryExecPlaceToPage; btree->fillRoot = ginEntryFillRoot; btree->prepareDownlink = entryPrepareDownlink; diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index a86a6cb57ef..e042f5ca1fd 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -656,12 +656,12 @@ typedef struct GinBtreeStack typedef struct GinBtreeData *GinBtree; -/* Return codes for GinBtreeData.placeToPage method */ +/* Return codes for GinBtreeData.beginPlaceToPage method */ typedef enum { - UNMODIFIED, - INSERTED, - SPLIT + GPTP_NO_WORK, + GPTP_INSERT, + GPTP_SPLIT } GinPlaceToPageRC; typedef struct GinBtreeData @@ -674,7 +674,8 @@ typedef struct GinBtreeData /* insert methods */ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); - GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *); + GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *, XLogRecData *); + void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *, XLogRecData *); void *(*prepareDownlink) (GinBtree, Buffer); void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);