2006-05-02 11:28:56 +00:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* ginxlog.c
|
|
|
|
* WAL replay logic for inverted index.
|
|
|
|
*
|
|
|
|
*
|
2014-01-07 16:05:30 -05:00
|
|
|
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
|
2006-05-02 11:28:56 +00:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/access/gin/ginxlog.c
|
2006-05-02 11:28:56 +00:00
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
Fix GIN to support null keys, empty and null items, and full index scans.
Per my recent proposal(s). Null key datums can now be returned by
extractValue and extractQuery functions, and will be stored in the index.
Also, placeholder entries are made for indexable items that are NULL or
contain no keys according to extractValue. This means that the index is
now always complete, having at least one entry for every indexed heap TID,
and so we can get rid of the prohibition on full-index scans. A full-index
scan is implemented much the same way as partial-match scans were already:
we build a bitmap representing all the TIDs found in the index, and then
drive the results off that.
Also, introduce a concept of a "search mode" that can be requested by
extractQuery when the operator requires matching to empty items (this is
just as cheap as matching to a single key) or requires a full index scan
(which is not so cheap, but it sure beats failing or giving wrong answers).
The behavior remains backward compatible for opclasses that don't return
any null keys or request a non-default search mode.
Using these features, we can now make the GIN index opclass for anyarray
behave in a way that matches the actual anyarray operators for &&, <@, @>,
and = ... which it failed to do before in assorted corner cases.
This commit fixes the core GIN code and ginarrayprocs.c, updates the
documentation, and adds some simple regression test cases for the new
behaviors using the array operators. The tsearch and contrib GIN opclass
support functions still need to be looked over and probably fixed.
Another thing I intend to fix separately is that this is pretty inefficient
for cases where more than one scan condition needs a full-index search:
we'll run duplicate GinScanEntrys, each one of which builds a large bitmap.
There is some existing logic to merge duplicate GinScanEntrys but it needs
refactoring to make it work for entries belonging to different scan keys.
Note that most of gin.h has been split out into a new file gin_private.h,
so that gin.h doesn't export anything that's not supposed to be used by GIN
opclasses or the rest of the backend. I did quite a bit of other code
beautification work as well, mostly fixing comments and choosing more
appropriate names for things.
2011-01-07 19:16:24 -05:00
|
|
|
#include "access/gin_private.h"
|
2008-05-12 00:00:54 +00:00
|
|
|
#include "access/xlogutils.h"
|
2006-05-02 11:28:56 +00:00
|
|
|
#include "utils/memutils.h"
|
|
|
|
|
2006-10-04 00:30:14 +00:00
|
|
|
/* Memory context providing working memory for operations in this file. */
static MemoryContext opCtx;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
static void
|
2013-11-27 19:21:23 +02:00
|
|
|
ginRedoClearIncompleteSplit(XLogRecPtr lsn, RelFileNode node, BlockNumber blkno)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
buffer = XLogReadBuffer(node, blkno, false);
|
|
|
|
if (!BufferIsValid(buffer))
|
|
|
|
return; /* page was deleted, nothing to do */
|
|
|
|
page = (Page) BufferGetPage(buffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
if (lsn > PageGetLSN(page))
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
GinPageGetOpaque(page)->flags &= ~GIN_INCOMPLETE_SPLIT;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
2013-11-27 19:21:23 +02:00
|
|
|
|
|
|
|
UnlockReleaseBuffer(buffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 00:30:14 +00:00
|
|
|
ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
2006-05-02 11:28:56 +00:00
|
|
|
RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
|
2009-06-11 14:49:15 +00:00
|
|
|
Buffer RootBuffer,
|
|
|
|
MetaBuffer;
|
2006-05-02 11:28:56 +00:00
|
|
|
Page page;
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specfic functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
/* Backup blocks are not used in create_index records */
|
|
|
|
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
|
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true);
|
|
|
|
Assert(BufferIsValid(MetaBuffer));
|
2010-10-11 19:04:37 -04:00
|
|
|
page = (Page) BufferGetPage(MetaBuffer);
|
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
GinInitMetabuffer(MetaBuffer);
|
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
2010-10-11 19:04:37 -04:00
|
|
|
MarkBufferDirty(MetaBuffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true);
|
|
|
|
Assert(BufferIsValid(RootBuffer));
|
|
|
|
page = (Page) BufferGetPage(RootBuffer);
|
|
|
|
|
|
|
|
GinInitBuffer(RootBuffer, GIN_LEAF);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
2009-03-24 20:17:18 +00:00
|
|
|
MarkBufferDirty(RootBuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
UnlockReleaseBuffer(RootBuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
UnlockReleaseBuffer(MetaBuffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 00:30:14 +00:00
|
|
|
ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record);
|
2014-01-22 18:51:48 +02:00
|
|
|
char *ptr;
|
2006-05-02 11:28:56 +00:00
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specfic functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
/* Backup blocks are not used in create_ptree records */
|
|
|
|
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
|
|
|
|
|
2008-06-12 09:12:31 +00:00
|
|
|
buffer = XLogReadBuffer(data->node, data->blkno, true);
|
2006-05-02 11:28:56 +00:00
|
|
|
Assert(BufferIsValid(buffer));
|
|
|
|
page = (Page) BufferGetPage(buffer);
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED);
|
|
|
|
|
|
|
|
ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree);
|
|
|
|
|
|
|
|
/* Place page data */
|
|
|
|
memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size);
|
|
|
|
|
2014-04-14 21:03:01 +03:00
|
|
|
GinDataPageSetDataSize(page, data->size);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2014-01-22 18:51:48 +02:00
|
|
|
ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata;
|
2014-01-22 18:51:48 +02:00
|
|
|
OffsetNumber offset = data->offset;
|
2013-11-27 19:21:23 +02:00
|
|
|
IndexTuple itup;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
if (rightblkno != InvalidBlockNumber)
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
/* update link to right page after split */
|
|
|
|
Assert(!GinPageIsLeaf(page));
|
|
|
|
Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
|
|
|
|
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset));
|
|
|
|
GinSetDownlink(itup, rightblkno);
|
|
|
|
}
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
if (data->isDelete)
|
|
|
|
{
|
|
|
|
Assert(GinPageIsLeaf(page));
|
|
|
|
Assert(offset >= FirstOffsetNumber && offset <= PageGetMaxOffsetNumber(page));
|
|
|
|
PageIndexTupleDelete(page, offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
itup = &data->tuple;
|
|
|
|
|
|
|
|
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset, false, false) == InvalidOffsetNumber)
|
|
|
|
{
|
|
|
|
RelFileNode node;
|
2014-05-06 12:12:18 -04:00
|
|
|
ForkNumber forknum;
|
2013-11-27 19:21:23 +02:00
|
|
|
BlockNumber blknum;
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
BufferGetTag(buffer, &node, &forknum, &blknum);
|
|
|
|
elog(ERROR, "failed to add item to index page in %u/%u/%u",
|
|
|
|
node.spcNode, node.dbNode, node.relNode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2014-01-22 18:51:48 +02:00
|
|
|
ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
|
|
|
|
{
|
2014-03-31 15:15:19 +03:00
|
|
|
int actionno;
|
|
|
|
int segno;
|
|
|
|
GinPostingList *oldseg;
|
|
|
|
Pointer segmentend;
|
|
|
|
char *walbuf;
|
|
|
|
int totalsize;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the page is in pre-9.4 format, convert to new format first.
|
|
|
|
*/
|
|
|
|
if (!GinPageIsCompressed(page))
|
|
|
|
{
|
|
|
|
ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
|
|
|
|
int nuncompressed = GinPageGetOpaque(page)->maxoff;
|
|
|
|
int npacked;
|
|
|
|
GinPostingList *plist;
|
|
|
|
|
|
|
|
plist = ginCompressPostingList(uncompressed, nuncompressed,
|
|
|
|
BLCKSZ, &npacked);
|
|
|
|
Assert(npacked == nuncompressed);
|
|
|
|
|
|
|
|
totalsize = SizeOfGinPostingList(plist);
|
|
|
|
|
|
|
|
memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
|
2014-04-14 21:03:01 +03:00
|
|
|
GinDataPageSetDataSize(page, totalsize);
|
2014-03-31 15:15:19 +03:00
|
|
|
GinPageSetCompressed(page);
|
|
|
|
GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
|
|
|
|
}
|
|
|
|
|
|
|
|
oldseg = GinDataLeafPageGetPostingList(page);
|
|
|
|
segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
|
|
|
|
segno = 0;
|
|
|
|
|
|
|
|
walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
|
|
|
|
for (actionno = 0; actionno < data->nactions; actionno++)
|
|
|
|
{
|
|
|
|
uint8 a_segno = *((uint8 *) (walbuf++));
|
|
|
|
uint8 a_action = *((uint8 *) (walbuf++));
|
|
|
|
GinPostingList *newseg = NULL;
|
|
|
|
int newsegsize = 0;
|
|
|
|
ItemPointerData *items = NULL;
|
|
|
|
uint16 nitems = 0;
|
|
|
|
ItemPointerData *olditems;
|
|
|
|
int nolditems;
|
|
|
|
ItemPointerData *newitems;
|
|
|
|
int nnewitems;
|
|
|
|
int segsize;
|
|
|
|
Pointer segptr;
|
|
|
|
int szleft;
|
|
|
|
|
|
|
|
/* Extract all the information we need from the WAL record */
|
|
|
|
if (a_action == GIN_SEGMENT_INSERT ||
|
|
|
|
a_action == GIN_SEGMENT_REPLACE)
|
|
|
|
{
|
|
|
|
newseg = (GinPostingList *) walbuf;
|
|
|
|
newsegsize = SizeOfGinPostingList(newseg);
|
|
|
|
walbuf += SHORTALIGN(newsegsize);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (a_action == GIN_SEGMENT_ADDITEMS)
|
|
|
|
{
|
|
|
|
memcpy(&nitems, walbuf, sizeof(uint16));
|
|
|
|
walbuf += sizeof(uint16);
|
|
|
|
items = (ItemPointerData *) walbuf;
|
|
|
|
walbuf += nitems * sizeof(ItemPointerData);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip to the segment that this action concerns */
|
|
|
|
Assert(segno <= a_segno);
|
|
|
|
while (segno < a_segno)
|
|
|
|
{
|
|
|
|
oldseg = GinNextPostingListSegment(oldseg);
|
|
|
|
segno++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ADDITEMS action is handled like REPLACE, but the new segment to
|
|
|
|
* replace the old one is reconstructed using the old segment from
|
|
|
|
* disk and the new items from the WAL record.
|
|
|
|
*/
|
|
|
|
if (a_action == GIN_SEGMENT_ADDITEMS)
|
|
|
|
{
|
|
|
|
int npacked;
|
|
|
|
|
|
|
|
olditems = ginPostingListDecode(oldseg, &nolditems);
|
|
|
|
|
|
|
|
newitems = ginMergeItemPointers(items, nitems,
|
|
|
|
olditems, nolditems,
|
|
|
|
&nnewitems);
|
|
|
|
Assert(nnewitems == nolditems + nitems);
|
|
|
|
|
|
|
|
newseg = ginCompressPostingList(newitems, nnewitems,
|
|
|
|
BLCKSZ, &npacked);
|
|
|
|
Assert(npacked == nnewitems);
|
|
|
|
|
|
|
|
newsegsize = SizeOfGinPostingList(newseg);
|
|
|
|
a_action = GIN_SEGMENT_REPLACE;
|
|
|
|
}
|
|
|
|
|
|
|
|
segptr = (Pointer) oldseg;
|
|
|
|
if (segptr != segmentend)
|
|
|
|
segsize = SizeOfGinPostingList(oldseg);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Positioned after the last existing segment. Only INSERTs
|
|
|
|
* expected here.
|
|
|
|
*/
|
|
|
|
Assert(a_action == GIN_SEGMENT_INSERT);
|
|
|
|
segsize = 0;
|
|
|
|
}
|
|
|
|
szleft = segmentend - segptr;
|
|
|
|
|
|
|
|
switch (a_action)
|
|
|
|
{
|
|
|
|
case GIN_SEGMENT_DELETE:
|
|
|
|
memmove(segptr, segptr + segsize, szleft - segsize);
|
|
|
|
segmentend -= segsize;
|
|
|
|
|
|
|
|
segno++;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GIN_SEGMENT_INSERT:
|
|
|
|
/* make room for the new segment */
|
|
|
|
memmove(segptr + newsegsize, segptr, szleft);
|
|
|
|
/* copy the new segment in place */
|
|
|
|
memcpy(segptr, newseg, newsegsize);
|
|
|
|
segmentend += newsegsize;
|
|
|
|
segptr += newsegsize;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case GIN_SEGMENT_REPLACE:
|
|
|
|
/* shift the segments that follow */
|
|
|
|
memmove(segptr + newsegsize,
|
|
|
|
segptr + segsize,
|
|
|
|
szleft - segsize);
|
|
|
|
/* copy the replacement segment in place */
|
|
|
|
memcpy(segptr, newseg, newsegsize);
|
|
|
|
segmentend -= segsize;
|
|
|
|
segmentend += newsegsize;
|
|
|
|
segptr += newsegsize;
|
|
|
|
segno++;
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
elog(ERROR, "unexpected GIN leaf action: %u", a_action);
|
|
|
|
}
|
|
|
|
oldseg = (GinPostingList *) segptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
|
2014-04-14 21:03:01 +03:00
|
|
|
GinDataPageSetDataSize(page, totalsize);
|
2014-01-22 18:51:48 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata)
|
2013-11-27 19:21:23 +02:00
|
|
|
{
|
|
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
if (isLeaf)
|
2013-11-27 19:21:23 +02:00
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata;
|
2013-11-27 19:21:23 +02:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
Assert(GinPageIsLeaf(page));
|
|
|
|
|
|
|
|
ginRedoRecompress(page, data);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata;
|
2013-11-27 19:21:23 +02:00
|
|
|
PostingItem *oldpitem;
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
Assert(!GinPageIsLeaf(page));
|
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
/* update link to right page after split */
|
2014-01-22 18:51:48 +02:00
|
|
|
oldpitem = GinDataPageGetPostingItem(page, data->offset);
|
2013-11-27 19:21:23 +02:00
|
|
|
PostingItemSetBlockNumber(oldpitem, rightblkno);
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
GinDataPageAddPostingItem(page, &data->newitem, data->offset);
|
2013-11-27 19:21:23 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
|
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
char *payload;
|
|
|
|
BlockNumber leftChildBlkno = InvalidBlockNumber;
|
|
|
|
BlockNumber rightChildBlkno = InvalidBlockNumber;
|
|
|
|
bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
|
|
|
|
|
|
|
|
payload = XLogRecGetData(record) + sizeof(ginxlogInsert);
|
|
|
|
|
|
|
|
/*
|
2014-05-06 12:12:18 -04:00
|
|
|
* First clear incomplete-split flag on child page if this finishes a
|
|
|
|
* split.
|
2013-11-27 19:21:23 +02:00
|
|
|
*/
|
|
|
|
if (!isLeaf)
|
|
|
|
{
|
|
|
|
leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
|
|
|
|
payload += sizeof(BlockIdData);
|
|
|
|
rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload);
|
|
|
|
payload += sizeof(BlockIdData);
|
|
|
|
|
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 0, false, false);
|
|
|
|
else
|
|
|
|
ginRedoClearIncompleteSplit(lsn, data->node, leftChildBlkno);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
/* If we have a full-page image, restore it and we're done */
|
2013-11-27 19:21:23 +02:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(isLeaf ? 0 : 1))
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
(void) RestoreBackupBlock(lsn, record, isLeaf ? 0 : 1, false, false);
|
2006-05-02 11:28:56 +00:00
|
|
|
return;
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2008-06-12 09:12:31 +00:00
|
|
|
buffer = XLogReadBuffer(data->node, data->blkno, false);
|
2010-10-11 19:04:37 -04:00
|
|
|
if (!BufferIsValid(buffer))
|
|
|
|
return; /* page was deleted, nothing to do */
|
2006-05-02 11:28:56 +00:00
|
|
|
page = (Page) BufferGetPage(buffer);
|
|
|
|
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
/* How to insert the payload is tree-type specific */
|
|
|
|
if (data->flags & GIN_INSERT_ISDATA)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2010-10-11 19:04:37 -04:00
|
|
|
Assert(GinPageIsData(page));
|
2014-01-22 18:51:48 +02:00
|
|
|
ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload);
|
2007-10-29 19:26:57 +00:00
|
|
|
}
|
2010-10-11 19:04:37 -04:00
|
|
|
else
|
2007-10-29 19:26:57 +00:00
|
|
|
{
|
2010-10-11 19:04:37 -04:00
|
|
|
Assert(!GinPageIsData(page));
|
2014-01-22 18:51:48 +02:00
|
|
|
ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload);
|
2013-11-27 19:21:23 +02:00
|
|
|
}
|
2007-10-29 19:26:57 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
static void
|
|
|
|
ginRedoSplitEntry(Page lpage, Page rpage, void *rdata)
|
|
|
|
{
|
|
|
|
ginxlogSplitEntry *data = (ginxlogSplitEntry *) rdata;
|
|
|
|
IndexTuple itup = (IndexTuple) ((char *) rdata + sizeof(ginxlogSplitEntry));
|
|
|
|
OffsetNumber i;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
for (i = 0; i < data->separator; i++)
|
|
|
|
{
|
|
|
|
if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add item to gin index page");
|
|
|
|
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
for (i = data->separator; i < data->nitem; i++)
|
|
|
|
{
|
|
|
|
if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add item to gin index page");
|
|
|
|
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
|
|
|
|
}
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
static void
|
|
|
|
ginRedoSplitData(Page lpage, Page rpage, void *rdata)
|
|
|
|
{
|
|
|
|
bool isleaf = GinPageIsLeaf(lpage);
|
|
|
|
|
|
|
|
if (isleaf)
|
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
ginxlogSplitDataLeaf *data = (ginxlogSplitDataLeaf *) rdata;
|
|
|
|
Pointer lptr = (Pointer) rdata + sizeof(ginxlogSplitDataLeaf);
|
|
|
|
Pointer rptr = lptr + data->lsize;
|
|
|
|
|
2014-04-14 21:03:01 +03:00
|
|
|
Assert(data->lsize > 0 && data->lsize <= GinDataPageMaxDataSize);
|
|
|
|
Assert(data->rsize > 0 && data->rsize <= GinDataPageMaxDataSize);
|
2014-01-22 18:51:48 +02:00
|
|
|
|
|
|
|
memcpy(GinDataLeafPageGetPostingList(lpage), lptr, data->lsize);
|
|
|
|
memcpy(GinDataLeafPageGetPostingList(rpage), rptr, data->rsize);
|
|
|
|
|
2014-04-14 21:03:01 +03:00
|
|
|
GinDataPageSetDataSize(lpage, data->lsize);
|
|
|
|
GinDataPageSetDataSize(rpage, data->rsize);
|
2014-01-22 18:51:48 +02:00
|
|
|
*GinDataPageGetRightBound(lpage) = data->lrightbound;
|
|
|
|
*GinDataPageGetRightBound(rpage) = data->rrightbound;
|
2013-11-27 19:21:23 +02:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
ginxlogSplitDataInternal *data = (ginxlogSplitDataInternal *) rdata;
|
|
|
|
PostingItem *items = (PostingItem *) ((char *) rdata + sizeof(ginxlogSplitDataInternal));
|
|
|
|
OffsetNumber i;
|
|
|
|
OffsetNumber maxoff;
|
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
for (i = 0; i < data->separator; i++)
|
|
|
|
GinDataPageAddPostingItem(lpage, &items[i], InvalidOffsetNumber);
|
|
|
|
for (i = data->separator; i < data->nitem; i++)
|
|
|
|
GinDataPageAddPostingItem(rpage, &items[i], InvalidOffsetNumber);
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
/* set up right key */
|
|
|
|
maxoff = GinPageGetOpaque(lpage)->maxoff;
|
|
|
|
*GinDataPageGetRightBound(lpage) = GinDataPageGetPostingItem(lpage, maxoff)->key;
|
|
|
|
*GinDataPageGetRightBound(rpage) = data->rightbound;
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 00:30:14 +00:00
|
|
|
ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
|
|
|
|
Buffer lbuffer,
|
|
|
|
rbuffer;
|
|
|
|
Page lpage,
|
|
|
|
rpage;
|
2014-04-07 14:34:31 +03:00
|
|
|
uint32 flags;
|
|
|
|
uint32 lflags,
|
|
|
|
rflags;
|
2013-11-27 19:21:23 +02:00
|
|
|
char *payload;
|
|
|
|
bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
|
|
|
|
bool isData = (data->flags & GIN_INSERT_ISDATA) != 0;
|
|
|
|
bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0;
|
|
|
|
|
|
|
|
payload = XLogRecGetData(record) + sizeof(ginxlogSplit);
|
|
|
|
|
|
|
|
/*
|
2014-05-06 12:12:18 -04:00
|
|
|
* First clear incomplete-split flag on child page if this finishes a
|
|
|
|
* split
|
2013-11-27 19:21:23 +02:00
|
|
|
*/
|
|
|
|
if (!isLeaf)
|
|
|
|
{
|
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 0, false, false);
|
|
|
|
else
|
|
|
|
ginRedoClearIncompleteSplit(lsn, data->node, data->leftChildBlkno);
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2014-04-07 14:34:31 +03:00
|
|
|
flags = 0;
|
2013-11-27 19:21:23 +02:00
|
|
|
if (isLeaf)
|
2006-05-02 11:28:56 +00:00
|
|
|
flags |= GIN_LEAF;
|
2013-11-27 19:21:23 +02:00
|
|
|
if (isData)
|
|
|
|
flags |= GIN_DATA;
|
2014-01-22 18:51:48 +02:00
|
|
|
if (isLeaf && isData)
|
|
|
|
flags |= GIN_COMPRESSED;
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specfic functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
|
2014-04-07 14:34:31 +03:00
|
|
|
lflags = rflags = flags;
|
|
|
|
if (!isRoot)
|
|
|
|
lflags |= GIN_INCOMPLETE_SPLIT;
|
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
lbuffer = XLogReadBuffer(data->node, data->lblkno, true);
|
2006-05-02 11:28:56 +00:00
|
|
|
Assert(BufferIsValid(lbuffer));
|
|
|
|
lpage = (Page) BufferGetPage(lbuffer);
|
2014-04-07 14:34:31 +03:00
|
|
|
GinInitBuffer(lbuffer, lflags);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2008-06-12 09:12:31 +00:00
|
|
|
rbuffer = XLogReadBuffer(data->node, data->rblkno, true);
|
2006-05-02 11:28:56 +00:00
|
|
|
Assert(BufferIsValid(rbuffer));
|
|
|
|
rpage = (Page) BufferGetPage(rbuffer);
|
2014-04-07 14:34:31 +03:00
|
|
|
GinInitBuffer(rbuffer, rflags);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2006-10-04 00:30:14 +00:00
|
|
|
GinPageGetOpaque(lpage)->rightlink = BufferGetBlockNumber(rbuffer);
|
2013-11-27 19:21:23 +02:00
|
|
|
GinPageGetOpaque(rpage)->rightlink = isRoot ? InvalidBlockNumber : data->rrlink;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
/* Do the tree-type specific portion to restore the page contents */
|
|
|
|
if (isData)
|
|
|
|
ginRedoSplitData(lpage, rpage, payload);
|
2006-10-04 00:30:14 +00:00
|
|
|
else
|
2013-11-27 19:21:23 +02:00
|
|
|
ginRedoSplitEntry(lpage, rpage, payload);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
PageSetLSN(rpage, lsn);
|
|
|
|
MarkBufferDirty(rbuffer);
|
|
|
|
|
|
|
|
PageSetLSN(lpage, lsn);
|
|
|
|
MarkBufferDirty(lbuffer);
|
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
if (isRoot)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2014-05-06 12:12:18 -04:00
|
|
|
BlockNumber rootBlkno = data->rrlink;
|
2013-11-27 19:21:23 +02:00
|
|
|
Buffer rootBuf = XLogReadBuffer(data->node, rootBlkno, true);
|
2006-10-04 00:30:14 +00:00
|
|
|
Page rootPage = BufferGetPage(rootBuf);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
GinInitBuffer(rootBuf, flags & ~GIN_LEAF & ~GIN_COMPRESSED);
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
if (isData)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
Assert(rootBlkno != GIN_ROOT_BLKNO);
|
|
|
|
ginDataFillRoot(NULL, BufferGetPage(rootBuf),
|
|
|
|
BufferGetBlockNumber(lbuffer),
|
|
|
|
BufferGetPage(lbuffer),
|
|
|
|
BufferGetBlockNumber(rbuffer),
|
|
|
|
BufferGetPage(rbuffer));
|
2006-10-04 00:30:14 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2013-11-27 19:21:23 +02:00
|
|
|
Assert(rootBlkno == GIN_ROOT_BLKNO);
|
|
|
|
ginEntryFillRoot(NULL, BufferGetPage(rootBuf),
|
|
|
|
BufferGetBlockNumber(lbuffer),
|
|
|
|
BufferGetPage(lbuffer),
|
|
|
|
BufferGetBlockNumber(rbuffer),
|
|
|
|
BufferGetPage(rbuffer));
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
PageSetLSN(rootPage, lsn);
|
|
|
|
|
|
|
|
MarkBufferDirty(rootBuf);
|
|
|
|
UnlockReleaseBuffer(rootBuf);
|
2006-10-04 00:30:14 +00:00
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
UnlockReleaseBuffer(rbuffer);
|
|
|
|
UnlockReleaseBuffer(lbuffer);
|
|
|
|
}
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
/*
|
2014-07-31 16:48:43 +03:00
|
|
|
* VACUUM_PAGE record contains simply a full image of the page, similar to
|
|
|
|
* a XLOG_FPI record.
|
2014-01-22 18:51:48 +02:00
|
|
|
*/
|
2006-05-02 11:28:56 +00:00
|
|
|
static void
|
2006-10-04 00:30:14 +00:00
|
|
|
ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
ginxlogVacuumPage *xlrec = (ginxlogVacuumPage *) XLogRecGetData(record);
|
|
|
|
char *blk = ((char *) xlrec) + sizeof(ginxlogVacuumPage);
|
2006-05-02 11:28:56 +00:00
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
Assert(xlrec->hole_offset < BLCKSZ);
|
|
|
|
Assert(xlrec->hole_length < BLCKSZ);
|
|
|
|
|
2014-04-01 21:16:10 +03:00
|
|
|
/* Backup blocks are not used, we'll re-initialize the page always. */
|
|
|
|
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, true);
|
2010-10-11 19:04:37 -04:00
|
|
|
if (!BufferIsValid(buffer))
|
|
|
|
return;
|
2006-05-02 11:28:56 +00:00
|
|
|
page = (Page) BufferGetPage(buffer);
|
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
if (xlrec->hole_length == 0)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
2014-01-22 18:51:48 +02:00
|
|
|
memcpy((char *) page, blk, BLCKSZ);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
memcpy((char *) page, blk, xlrec->hole_offset);
|
|
|
|
/* must zero-fill the hole */
|
|
|
|
MemSet((char *) page + xlrec->hole_offset, 0, xlrec->hole_length);
|
|
|
|
memcpy((char *) page + (xlrec->hole_offset + xlrec->hole_length),
|
|
|
|
blk + xlrec->hole_offset,
|
|
|
|
BLCKSZ - (xlrec->hole_offset + xlrec->hole_length));
|
|
|
|
}
|
2006-10-04 00:30:14 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
PageSetLSN(page, lsn);
|
2006-10-04 00:30:14 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
}
|
2006-10-04 00:30:14 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
static void
|
|
|
|
ginRedoVacuumDataLeafPage(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetData(record);
|
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
|
|
|
|
/* If we have a full-page image, restore it and we're done */
|
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
{
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 0, false, false);
|
|
|
|
return;
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
2014-01-22 18:51:48 +02:00
|
|
|
buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
|
|
|
|
if (!BufferIsValid(buffer))
|
|
|
|
return;
|
|
|
|
page = (Page) BufferGetPage(buffer);
|
|
|
|
|
|
|
|
Assert(GinPageIsLeaf(page));
|
|
|
|
Assert(GinPageIsData(page));
|
|
|
|
|
|
|
|
if (lsn > PageGetLSN(page))
|
|
|
|
{
|
|
|
|
ginRedoRecompress(page, &xlrec->data);
|
2010-10-11 19:04:37 -04:00
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2006-10-04 00:30:14 +00:00
|
|
|
ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record);
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
Buffer dbuffer;
|
|
|
|
Buffer pbuffer;
|
|
|
|
Buffer lbuffer;
|
2006-05-02 11:28:56 +00:00
|
|
|
Page page;
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
dbuffer = RestoreBackupBlock(lsn, record, 0, false, true);
|
|
|
|
else
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
dbuffer = XLogReadBuffer(data->node, data->blkno, false);
|
|
|
|
if (BufferIsValid(dbuffer))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
page = BufferGetPage(dbuffer);
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
|
|
|
Assert(GinPageIsData(page));
|
|
|
|
GinPageGetOpaque(page)->flags = GIN_DELETED;
|
|
|
|
PageSetLSN(page, lsn);
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
MarkBufferDirty(dbuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(1))
|
|
|
|
pbuffer = RestoreBackupBlock(lsn, record, 1, false, true);
|
|
|
|
else
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
pbuffer = XLogReadBuffer(data->node, data->parentBlkno, false);
|
|
|
|
if (BufferIsValid(pbuffer))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
page = BufferGetPage(pbuffer);
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
|
|
|
Assert(GinPageIsData(page));
|
|
|
|
Assert(!GinPageIsLeaf(page));
|
2010-10-17 21:43:26 -04:00
|
|
|
GinPageDeletePostingItem(page, data->parentOffset);
|
2010-10-11 19:04:37 -04:00
|
|
|
PageSetLSN(page, lsn);
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
MarkBufferDirty(pbuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(2))
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 2, false, false);
|
|
|
|
else if (data->leftBlkno != InvalidBlockNumber)
|
2006-10-04 00:30:14 +00:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
lbuffer = XLogReadBuffer(data->node, data->leftBlkno, false);
|
|
|
|
if (BufferIsValid(lbuffer))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
page = BufferGetPage(lbuffer);
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
|
|
|
Assert(GinPageIsData(page));
|
|
|
|
GinPageGetOpaque(page)->rightlink = data->rightLink;
|
|
|
|
PageSetLSN(page, lsn);
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
MarkBufferDirty(lbuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
UnlockReleaseBuffer(lbuffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
|
|
|
|
if (BufferIsValid(pbuffer))
|
|
|
|
UnlockReleaseBuffer(pbuffer);
|
|
|
|
if (BufferIsValid(dbuffer))
|
|
|
|
UnlockReleaseBuffer(dbuffer);
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
static void
|
|
|
|
ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
2009-06-11 14:49:15 +00:00
|
|
|
ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
|
2009-03-24 20:17:18 +00:00
|
|
|
Buffer metabuffer;
|
|
|
|
Page metapage;
|
2010-10-11 19:04:37 -04:00
|
|
|
Buffer buffer;
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2014-03-12 09:59:49 +02:00
|
|
|
/*
|
2014-05-06 12:12:18 -04:00
|
|
|
* Restore the metapage. This is essentially the same as a full-page
|
|
|
|
* image, so restore the metapage unconditionally without looking at the
|
|
|
|
* LSN, to avoid torn page hazards.
|
2014-03-12 09:59:49 +02:00
|
|
|
*/
|
2009-03-24 20:17:18 +00:00
|
|
|
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
|
2010-10-11 19:04:37 -04:00
|
|
|
if (!BufferIsValid(metabuffer))
|
2012-02-26 15:12:17 -05:00
|
|
|
return; /* assume index was deleted, nothing to do */
|
2009-03-24 20:17:18 +00:00
|
|
|
metapage = BufferGetPage(metabuffer);
|
|
|
|
|
2014-03-12 09:59:49 +02:00
|
|
|
memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
|
|
|
|
PageSetLSN(metapage, lsn);
|
|
|
|
MarkBufferDirty(metabuffer);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2009-06-11 14:49:15 +00:00
|
|
|
if (data->ntuples > 0)
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* insert into tail page
|
|
|
|
*/
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 0, false, false);
|
|
|
|
else
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
2010-10-11 19:04:37 -04:00
|
|
|
buffer = XLogReadBuffer(data->node, data->metadata.tail, false);
|
|
|
|
if (BufferIsValid(buffer))
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
2010-10-11 19:04:37 -04:00
|
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
2010-10-11 19:04:37 -04:00
|
|
|
OffsetNumber l,
|
2011-04-10 11:42:00 -04:00
|
|
|
off = (PageIsEmpty(page)) ? FirstOffsetNumber :
|
|
|
|
OffsetNumberNext(PageGetMaxOffsetNumber(page));
|
2010-10-11 19:04:37 -04:00
|
|
|
int i,
|
2011-04-10 11:42:00 -04:00
|
|
|
tupsize;
|
2010-10-11 19:04:37 -04:00
|
|
|
IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta));
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
for (i = 0; i < data->ntuples; i++)
|
|
|
|
{
|
|
|
|
tupsize = IndexTupleSize(tuples);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
if (l == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add item to index page");
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
tuples = (IndexTuple) (((char *) tuples) + tupsize);
|
2011-11-25 13:58:59 -05:00
|
|
|
|
|
|
|
off++;
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2010-10-11 19:04:37 -04:00
|
|
|
/*
|
|
|
|
* Increase counter of heap tuples
|
|
|
|
*/
|
|
|
|
GinPageGetOpaque(page)->maxoff++;
|
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
}
|
|
|
|
UnlockReleaseBuffer(buffer);
|
2009-03-24 20:17:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2009-06-11 14:49:15 +00:00
|
|
|
else if (data->prevTail != InvalidBlockNumber)
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* New tail
|
|
|
|
*/
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
if (record->xl_info & XLR_BKP_BLOCK(0))
|
|
|
|
(void) RestoreBackupBlock(lsn, record, 0, false, false);
|
|
|
|
else
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
buffer = XLogReadBuffer(data->node, data->prevTail, false);
|
|
|
|
if (BufferIsValid(buffer))
|
2010-10-11 19:04:37 -04:00
|
|
|
{
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
Page page = BufferGetPage(buffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2012-12-28 13:06:15 -03:00
|
|
|
if (lsn > PageGetLSN(page))
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
{
|
|
|
|
GinPageGetOpaque(page)->rightlink = data->newRightlink;
|
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
}
|
|
|
|
UnlockReleaseBuffer(buffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
}
|
2009-03-24 20:17:18 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
UnlockReleaseBuffer(metabuffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
2009-06-11 14:49:15 +00:00
|
|
|
ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record);
|
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber l,
|
|
|
|
off = FirstOffsetNumber;
|
|
|
|
int i,
|
|
|
|
tupsize;
|
|
|
|
IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage));
|
2009-03-24 20:17:18 +00:00
|
|
|
|
Fix two bugs in WAL-logging of GIN pending-list pages.
In writeListPage, never take a full-page image of the page, because we
have all the information required to re-initialize in the WAL record
anyway. Before this fix, a full-page image was always generated, unless
full_page_writes=off, because when the page is initialized its LSN is
always 0. In stable-branches, keep the code to restore the backup blocks
if they exist, in case that the WAL is generated with an older minor
version, but in master Assert that there are no full-page images.
In the redo routine, add missing "off++". Otherwise the tuples are added
to the page in reverse order. That happens to be harmless because we
always scan and remove all the tuples together, but it was clearly wrong.
Also, it was masked by the first bug unless full_page_writes=off, because
the page was always restored from a full-page image.
Backpatch to all supported versions.
2014-04-28 16:12:45 +03:00
|
|
|
/*
|
|
|
|
* Backup blocks are not used, we always re-initialize the page.
|
|
|
|
*/
|
|
|
|
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
|
2009-03-24 20:17:18 +00:00
|
|
|
|
|
|
|
buffer = XLogReadBuffer(data->node, data->blkno, true);
|
2010-10-11 19:04:37 -04:00
|
|
|
Assert(BufferIsValid(buffer));
|
2009-03-24 20:17:18 +00:00
|
|
|
page = BufferGetPage(buffer);
|
|
|
|
|
|
|
|
GinInitBuffer(buffer, GIN_LIST);
|
|
|
|
GinPageGetOpaque(page)->rightlink = data->rightlink;
|
2009-06-11 14:49:15 +00:00
|
|
|
if (data->rightlink == InvalidBlockNumber)
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
|
|
|
/* tail of sublist */
|
|
|
|
GinPageSetFullRow(page);
|
|
|
|
GinPageGetOpaque(page)->maxoff = 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
GinPageGetOpaque(page)->maxoff = 0;
|
|
|
|
}
|
|
|
|
|
2009-06-11 14:49:15 +00:00
|
|
|
for (i = 0; i < data->ntuples; i++)
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
|
|
|
tupsize = IndexTupleSize(tuples);
|
|
|
|
|
2009-06-11 14:49:15 +00:00
|
|
|
l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
|
|
|
if (l == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add item to index page");
|
|
|
|
|
2009-06-11 14:49:15 +00:00
|
|
|
tuples = (IndexTuple) (((char *) tuples) + tupsize);
|
Fix two bugs in WAL-logging of GIN pending-list pages.
In writeListPage, never take a full-page image of the page, because we
have all the information required to re-initialize in the WAL record
anyway. Before this fix, a full-page image was always generated, unless
full_page_writes=off, because when the page is initialized its LSN is
always 0. In stable-branches, keep the code to restore the backup blocks
if they exist, in case that the WAL is generated with an older minor
version, but in master Assert that there are no full-page images.
In the redo routine, add missing "off++". Otherwise the tuples are added
to the page in reverse order. That happens to be harmless because we
always scan and remove all the tuples together, but it was clearly wrong.
Also, it was masked by the first bug unless full_page_writes=off, because
the page was always restored from a full-page image.
Backpatch to all supported versions.
2014-04-28 16:12:45 +03:00
|
|
|
off++;
|
2009-03-24 20:17:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
|
|
|
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
2009-06-11 14:49:15 +00:00
|
|
|
ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record);
|
2009-03-24 20:17:18 +00:00
|
|
|
Buffer metabuffer;
|
|
|
|
Page metapage;
|
2009-06-11 14:49:15 +00:00
|
|
|
int i;
|
2009-03-24 20:17:18 +00:00
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
/* Backup blocks are not used in delete_listpage records */
|
|
|
|
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
|
|
|
|
|
2009-03-24 20:17:18 +00:00
|
|
|
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
|
2010-10-11 19:04:37 -04:00
|
|
|
if (!BufferIsValid(metabuffer))
|
2012-02-26 15:12:17 -05:00
|
|
|
return; /* assume index was deleted, nothing to do */
|
2009-03-24 20:17:18 +00:00
|
|
|
metapage = BufferGetPage(metabuffer);
|
|
|
|
|
2014-03-12 09:59:49 +02:00
|
|
|
memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
|
|
|
|
PageSetLSN(metapage, lsn);
|
|
|
|
MarkBufferDirty(metabuffer);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
/*
|
|
|
|
* In normal operation, shiftList() takes exclusive lock on all the
|
2014-05-06 12:12:18 -04:00
|
|
|
* pages-to-be-deleted simultaneously. During replay, however, it should
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
* be all right to lock them one at a time. This is dependent on the fact
|
|
|
|
* that we are deleting pages from the head of the list, and that readers
|
|
|
|
* share-lock the next page before releasing the one they are on. So we
|
|
|
|
* cannot get past a reader that is on, or due to visit, any page we are
|
|
|
|
* going to delete. New incoming readers will block behind our metapage
|
|
|
|
* lock and then see a fully updated page list.
|
2014-05-08 14:43:04 +03:00
|
|
|
*
|
|
|
|
* No full-page images are taken of the deleted pages. Instead, they are
|
|
|
|
* re-initialized as empty, deleted pages. Their right-links don't need to
|
|
|
|
* be preserved, because no new readers can see the pages, as explained
|
|
|
|
* above.
|
Fix multiple problems in WAL replay.
Most of the replay functions for WAL record types that modify more than
one page failed to ensure that those pages were locked correctly to ensure
that concurrent queries could not see inconsistent page states. This is
a hangover from coding decisions made long before Hot Standby was added,
when it was hardly necessary to acquire buffer locks during WAL replay
at all, let alone hold them for carefully-chosen periods.
The key problem was that RestoreBkpBlocks was written to hold lock on each
page restored from a full-page image for only as long as it took to update
that page. This was guaranteed to break any WAL replay function in which
there was any update-ordering constraint between pages, because even if the
nominal order of the pages is the right one, any mixture of full-page and
non-full-page updates in the same record would result in out-of-order
updates. Moreover, it wouldn't work for situations where there's a
requirement to maintain lock on one page while updating another. Failure
to honor an update ordering constraint in this way is thought to be the
cause of bug #7648 from Daniel Farina: what seems to have happened there
is that a btree page being split was rewritten from a full-page image
before the new right sibling page was written, and because lock on the
original page was not maintained it was possible for hot standby queries to
try to traverse the page's right-link to the not-yet-existing sibling page.
To fix, get rid of RestoreBkpBlocks as such, and instead create a new
function RestoreBackupBlock that restores just one full-page image at a
time. This function can be invoked by WAL replay functions at the points
where they would otherwise perform non-full-page updates; in this way, the
physical order of page updates remains the same no matter which pages are
replaced by full-page images. We can then further adjust the logic in
individual replay functions if it is necessary to hold buffer locks
for overlapping periods. A side benefit is that we can simplify the
handling of concurrency conflict resolution by moving that code into the
record-type-specific functions; there's no more need to contort the code
layout to keep conflict resolution in front of the RestoreBkpBlocks call.
In connection with that, standardize on zero-based numbering rather than
one-based numbering for referencing the full-page images. In HEAD, I
removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are
still there in the header files in previous branches, but are no longer
used by the code.
In addition, fix some other bugs identified in the course of making these
changes:
spgRedoAddNode could fail to update the parent downlink at all, if the
parent tuple is in the same page as either the old or new split tuple and
we're not doing a full-page image: it would get fooled by the LSN having
been advanced already. This would result in permanent index corruption,
not just transient failure of concurrent queries.
Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old
tail page as a candidate for a full-page image; in the worst case this
could result in torn-page corruption.
heap_xlog_freeze() was inconsistent about using a cleanup lock or plain
exclusive lock: it did the former in the normal path but the latter for a
full-page image. A plain exclusive lock seems sufficient, so change to
that.
Also, remove gistRedoPageDeleteRecord(), which has been dead code since
VACUUM FULL was rewritten.
Back-patch to 9.0, where hot standby was introduced. Note however that 9.0
had a significantly different WAL-logging scheme for GIST index updates,
and it doesn't appear possible to make that scheme safe for concurrent hot
standby queries, because it can leave inconsistent states in the index even
between WAL records. Given the lack of complaints from the field, we won't
work too hard on fixing that branch.
2012-11-12 22:05:08 -05:00
|
|
|
*/
|
2009-06-11 14:49:15 +00:00
|
|
|
for (i = 0; i < data->ndeleted; i++)
|
2009-03-24 20:17:18 +00:00
|
|
|
{
|
2014-05-08 14:43:04 +03:00
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2014-05-08 14:43:04 +03:00
|
|
|
buffer = XLogReadBuffer(data->node, data->toDelete[i], true);
|
|
|
|
page = BufferGetPage(buffer);
|
|
|
|
GinInitBuffer(buffer, GIN_DELETED);
|
2009-03-24 20:17:18 +00:00
|
|
|
|
2014-05-08 14:43:04 +03:00
|
|
|
PageSetLSN(page, lsn);
|
|
|
|
MarkBufferDirty(buffer);
|
2010-10-11 19:04:37 -04:00
|
|
|
|
2014-05-08 14:43:04 +03:00
|
|
|
UnlockReleaseBuffer(buffer);
|
2009-03-24 20:17:18 +00:00
|
|
|
}
|
|
|
|
UnlockReleaseBuffer(metabuffer);
|
|
|
|
}
|
|
|
|
|
2006-10-04 00:30:14 +00:00
|
|
|
void
|
|
|
|
gin_redo(XLogRecPtr lsn, XLogRecord *record)
|
|
|
|
{
|
|
|
|
uint8 info = record->xl_info & ~XLR_INFO_MASK;
|
2013-11-27 19:21:23 +02:00
|
|
|
MemoryContext oldCtx;
|
2006-05-02 11:28:56 +00:00
|
|
|
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 01:32:45 +00:00
|
|
|
/*
|
2012-02-04 22:37:34 -05:00
|
|
|
* GIN indexes do not require any conflict processing. NB: If we ever
|
|
|
|
* implement a similar optimization as we have in b-tree, and remove
|
|
|
|
* killed tuples outside VACUUM, we'll need to handle that here.
|
Allow read only connections during recovery, known as Hot Standby.
Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record.
New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far.
This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required.
Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit.
Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 01:32:45 +00:00
|
|
|
*/
|
2009-01-20 18:59:37 +00:00
|
|
|
|
2013-11-27 19:21:23 +02:00
|
|
|
oldCtx = MemoryContextSwitchTo(opCtx);
|
2006-10-04 00:30:14 +00:00
|
|
|
switch (info)
|
|
|
|
{
|
|
|
|
case XLOG_GIN_CREATE_INDEX:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoCreateIndex(lsn, record);
|
|
|
|
break;
|
2006-10-04 00:30:14 +00:00
|
|
|
case XLOG_GIN_CREATE_PTREE:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoCreatePTree(lsn, record);
|
|
|
|
break;
|
2006-10-04 00:30:14 +00:00
|
|
|
case XLOG_GIN_INSERT:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoInsert(lsn, record);
|
|
|
|
break;
|
2006-10-04 00:30:14 +00:00
|
|
|
case XLOG_GIN_SPLIT:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoSplit(lsn, record);
|
|
|
|
break;
|
2006-10-04 00:30:14 +00:00
|
|
|
case XLOG_GIN_VACUUM_PAGE:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoVacuumPage(lsn, record);
|
|
|
|
break;
|
2014-01-22 18:51:48 +02:00
|
|
|
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
|
|
|
|
ginRedoVacuumDataLeafPage(lsn, record);
|
|
|
|
break;
|
2006-10-04 00:30:14 +00:00
|
|
|
case XLOG_GIN_DELETE_PAGE:
|
2006-05-02 11:28:56 +00:00
|
|
|
ginRedoDeletePage(lsn, record);
|
|
|
|
break;
|
2009-03-24 20:17:18 +00:00
|
|
|
case XLOG_GIN_UPDATE_META_PAGE:
|
|
|
|
ginRedoUpdateMetapage(lsn, record);
|
|
|
|
break;
|
|
|
|
case XLOG_GIN_INSERT_LISTPAGE:
|
|
|
|
ginRedoInsertListPage(lsn, record);
|
|
|
|
break;
|
2009-06-11 14:49:15 +00:00
|
|
|
case XLOG_GIN_DELETE_LISTPAGE:
|
2009-03-24 20:17:18 +00:00
|
|
|
ginRedoDeleteListPages(lsn, record);
|
|
|
|
break;
|
2006-05-02 11:28:56 +00:00
|
|
|
default:
|
|
|
|
elog(PANIC, "gin_redo: unknown op code %u", info);
|
|
|
|
}
|
2013-11-27 19:21:23 +02:00
|
|
|
MemoryContextSwitchTo(oldCtx);
|
2006-05-02 11:28:56 +00:00
|
|
|
MemoryContextReset(opCtx);
|
|
|
|
}
|
|
|
|
|
2006-10-04 00:30:14 +00:00
|
|
|
void
|
|
|
|
gin_xlog_startup(void)
|
|
|
|
{
|
2006-05-02 11:28:56 +00:00
|
|
|
opCtx = AllocSetContextCreate(CurrentMemoryContext,
|
2006-10-04 00:30:14 +00:00
|
|
|
"GIN recovery temporary context",
|
|
|
|
ALLOCSET_DEFAULT_MINSIZE,
|
|
|
|
ALLOCSET_DEFAULT_INITSIZE,
|
|
|
|
ALLOCSET_DEFAULT_MAXSIZE);
|
2006-05-02 11:28:56 +00:00
|
|
|
}
|
|
|
|
|
2006-10-04 00:30:14 +00:00
|
|
|
void
|
|
|
|
gin_xlog_cleanup(void)
|
|
|
|
{
|
2006-05-02 11:28:56 +00:00
|
|
|
MemoryContextDelete(opCtx);
|
2006-08-07 16:57:57 +00:00
|
|
|
}
|