885 lines
23 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* ginxlog.c
* WAL replay logic for inverted index.
*
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/access/gin/ginxlog.c
*-------------------------------------------------------------------------
*/
#include "postgres.h"
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
#include "access/gin_private.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
2006-10-04 00:30:14 +00:00
static MemoryContext opCtx; /* working memory for operations */
static MemoryContext topCtx;
2006-10-04 00:30:14 +00:00
typedef struct ginIncompleteSplit
{
RelFileNode node;
BlockNumber leftBlkno;
BlockNumber rightBlkno;
BlockNumber rootBlkno;
} ginIncompleteSplit;
static List *incomplete_splits;
static void
2006-10-04 00:30:14 +00:00
pushIncompleteSplit(RelFileNode node, BlockNumber leftBlkno, BlockNumber rightBlkno, BlockNumber rootBlkno)
{
ginIncompleteSplit *split;
2006-10-04 00:30:14 +00:00
MemoryContextSwitchTo(topCtx);
split = palloc(sizeof(ginIncompleteSplit));
split->node = node;
split->leftBlkno = leftBlkno;
split->rightBlkno = rightBlkno;
split->rootBlkno = rootBlkno;
incomplete_splits = lappend(incomplete_splits, split);
2006-10-04 00:30:14 +00:00
MemoryContextSwitchTo(opCtx);
}
static void
2006-10-04 00:30:14 +00:00
forgetIncompleteSplit(RelFileNode node, BlockNumber leftBlkno, BlockNumber updateBlkno)
{
ListCell *l;
2006-10-04 00:30:14 +00:00
foreach(l, incomplete_splits)
{
ginIncompleteSplit *split = (ginIncompleteSplit *) lfirst(l);
2006-10-04 00:30:14 +00:00
if (RelFileNodeEquals(node, split->node) && leftBlkno == split->leftBlkno && updateBlkno == split->rightBlkno)
{
incomplete_splits = list_delete_ptr(incomplete_splits, split);
break;
}
}
}
static void
2006-10-04 00:30:14 +00:00
ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
{
RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
Buffer RootBuffer,
MetaBuffer;
Page page;
MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true);
Assert(BufferIsValid(MetaBuffer));
page = (Page) BufferGetPage(MetaBuffer);
GinInitMetabuffer(MetaBuffer);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(MetaBuffer);
RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true);
Assert(BufferIsValid(RootBuffer));
page = (Page) BufferGetPage(RootBuffer);
GinInitBuffer(RootBuffer, GIN_LEAF);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(RootBuffer);
UnlockReleaseBuffer(RootBuffer);
UnlockReleaseBuffer(MetaBuffer);
}
static void
2006-10-04 00:30:14 +00:00
ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record);
ItemPointerData *items = (ItemPointerData *) (XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree));
Buffer buffer;
Page page;
buffer = XLogReadBuffer(data->node, data->blkno, true);
Assert(BufferIsValid(buffer));
page = (Page) BufferGetPage(buffer);
2006-10-04 00:30:14 +00:00
GinInitBuffer(buffer, GIN_DATA | GIN_LEAF);
memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * data->nitem);
GinPageGetOpaque(page)->maxoff = data->nitem;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
static void
2006-10-04 00:30:14 +00:00
ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record);
Buffer buffer;
Page page;
/* first, forget any incomplete split this insertion completes */
if (data->isData)
{
Assert(data->isDelete == FALSE);
if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
{
PostingItem *pitem;
pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert));
forgetIncompleteSplit(data->node,
PostingItemGetBlockNumber(pitem),
data->updateBlkno);
}
}
else
{
if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
{
IndexTuple itup;
itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
forgetIncompleteSplit(data->node,
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
GinGetDownlink(itup),
data->updateBlkno);
}
}
/* nothing else to do if page was backed up */
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
buffer = XLogReadBuffer(data->node, data->blkno, false);
if (!BufferIsValid(buffer))
return; /* page was deleted, nothing to do */
page = (Page) BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
2006-10-04 00:30:14 +00:00
{
if (data->isData)
2006-10-04 00:30:14 +00:00
{
Assert(GinPageIsData(page));
if (data->isLeaf)
{
OffsetNumber i;
ItemPointerData *items = (ItemPointerData *) (XLogRecGetData(record) + sizeof(ginxlogInsert));
Assert(GinPageIsLeaf(page));
Assert(data->updateBlkno == InvalidBlockNumber);
for (i = 0; i < data->nitem; i++)
GinDataPageAddItem(page, items + i, data->offset + i);
}
else
{
PostingItem *pitem;
Assert(!GinPageIsLeaf(page));
if (data->updateBlkno != InvalidBlockNumber)
{
/* update link to right page after split */
pitem = (PostingItem *) GinDataPageGetItem(page, data->offset);
PostingItemSetBlockNumber(pitem, data->updateBlkno);
}
pitem = (PostingItem *) (XLogRecGetData(record) + sizeof(ginxlogInsert));
GinDataPageAddItem(page, pitem, data->offset);
}
}
else
{
IndexTuple itup;
2007-11-15 21:14:46 +00:00
Assert(!GinPageIsData(page));
if (data->updateBlkno != InvalidBlockNumber)
{
/* update link to right page after split */
Assert(!GinPageIsLeaf(page));
Assert(data->offset >= FirstOffsetNumber && data->offset <= PageGetMaxOffsetNumber(page));
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, data->offset));
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
GinSetDownlink(itup, data->updateBlkno);
}
if (data->isDelete)
{
Assert(GinPageIsLeaf(page));
Assert(data->offset >= FirstOffsetNumber && data->offset <= PageGetMaxOffsetNumber(page));
PageIndexTupleDelete(page, data->offset);
}
itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
2007-11-15 21:14:46 +00:00
data->node.spcNode, data->node.dbNode, data->node.relNode);
}
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
static void
2006-10-04 00:30:14 +00:00
ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogSplit *data = (ginxlogSplit *) XLogRecGetData(record);
Buffer lbuffer,
rbuffer;
Page lpage,
rpage;
uint32 flags = 0;
2006-10-04 00:30:14 +00:00
if (data->isLeaf)
flags |= GIN_LEAF;
2006-10-04 00:30:14 +00:00
if (data->isData)
flags |= GIN_DATA;
lbuffer = XLogReadBuffer(data->node, data->lblkno, true);
Assert(BufferIsValid(lbuffer));
lpage = (Page) BufferGetPage(lbuffer);
GinInitBuffer(lbuffer, flags);
rbuffer = XLogReadBuffer(data->node, data->rblkno, true);
Assert(BufferIsValid(rbuffer));
rpage = (Page) BufferGetPage(rbuffer);
GinInitBuffer(rbuffer, flags);
2006-10-04 00:30:14 +00:00
GinPageGetOpaque(lpage)->rightlink = BufferGetBlockNumber(rbuffer);
GinPageGetOpaque(rpage)->rightlink = data->rrlink;
2006-10-04 00:30:14 +00:00
if (data->isData)
{
char *ptr = XLogRecGetData(record) + sizeof(ginxlogSplit);
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
Size sizeofitem = GinSizeOfDataPageItem(lpage);
OffsetNumber i;
2006-10-04 00:30:14 +00:00
ItemPointer bound;
2006-10-04 00:30:14 +00:00
for (i = 0; i < data->separator; i++)
{
GinDataPageAddItem(lpage, ptr, InvalidOffsetNumber);
ptr += sizeofitem;
}
2006-10-04 00:30:14 +00:00
for (i = data->separator; i < data->nitem; i++)
{
GinDataPageAddItem(rpage, ptr, InvalidOffsetNumber);
ptr += sizeofitem;
}
/* set up right key */
bound = GinDataPageGetRightBound(lpage);
2006-10-04 00:30:14 +00:00
if (data->isLeaf)
*bound = *(ItemPointerData *) GinDataPageGetItem(lpage, GinPageGetOpaque(lpage)->maxoff);
else
2006-10-04 00:30:14 +00:00
*bound = ((PostingItem *) GinDataPageGetItem(lpage, GinPageGetOpaque(lpage)->maxoff))->key;
bound = GinDataPageGetRightBound(rpage);
*bound = data->rightbound;
2006-10-04 00:30:14 +00:00
}
else
{
IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogSplit));
OffsetNumber i;
2006-10-04 00:30:14 +00:00
for (i = 0; i < data->separator; i++)
{
if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
2006-10-04 00:30:14 +00:00
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
}
2006-10-04 00:30:14 +00:00
for (i = data->separator; i < data->nitem; i++)
{
if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
2006-10-04 00:30:14 +00:00
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
}
}
PageSetLSN(rpage, lsn);
2006-05-03 06:56:47 +00:00
PageSetTLI(rpage, ThisTimeLineID);
MarkBufferDirty(rbuffer);
PageSetLSN(lpage, lsn);
PageSetTLI(lpage, ThisTimeLineID);
MarkBufferDirty(lbuffer);
2006-10-04 00:30:14 +00:00
if (!data->isLeaf && data->updateBlkno != InvalidBlockNumber)
forgetIncompleteSplit(data->node, data->leftChildBlkno, data->updateBlkno);
2006-10-04 00:30:14 +00:00
if (data->isRootSplit)
{
Buffer rootBuf = XLogReadBuffer(data->node, data->rootBlkno, true);
2006-10-04 00:30:14 +00:00
Page rootPage = BufferGetPage(rootBuf);
2006-10-04 00:30:14 +00:00
GinInitBuffer(rootBuf, flags & ~GIN_LEAF);
2006-10-04 00:30:14 +00:00
if (data->isData)
{
Assert(data->rootBlkno != GIN_ROOT_BLKNO);
ginDataFillRoot(NULL, rootBuf, lbuffer, rbuffer);
2006-10-04 00:30:14 +00:00
}
else
{
Assert(data->rootBlkno == GIN_ROOT_BLKNO);
ginEntryFillRoot(NULL, rootBuf, lbuffer, rbuffer);
}
PageSetLSN(rootPage, lsn);
PageSetTLI(rootPage, ThisTimeLineID);
MarkBufferDirty(rootBuf);
UnlockReleaseBuffer(rootBuf);
2006-10-04 00:30:14 +00:00
}
else
pushIncompleteSplit(data->node, data->lblkno, data->rblkno, data->rootBlkno);
UnlockReleaseBuffer(rbuffer);
UnlockReleaseBuffer(lbuffer);
}
static void
2006-10-04 00:30:14 +00:00
ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogVacuumPage *data = (ginxlogVacuumPage *) XLogRecGetData(record);
Buffer buffer;
Page page;
/* nothing to do if page was backed up (and no info to do it with) */
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
buffer = XLogReadBuffer(data->node, data->blkno, false);
if (!BufferIsValid(buffer))
return;
page = (Page) BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
2006-10-04 00:30:14 +00:00
{
if (GinPageIsData(page))
{
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
memcpy(GinDataPageGetData(page),
XLogRecGetData(record) + sizeof(ginxlogVacuumPage),
data->nitem * GinSizeOfDataPageItem(page));
GinPageGetOpaque(page)->maxoff = data->nitem;
}
else
{
OffsetNumber i,
2011-04-10 11:42:00 -04:00
*tod;
IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage));
2006-10-04 00:30:14 +00:00
tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page));
for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++)
tod[i - 1] = i;
2006-10-04 00:30:14 +00:00
PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page));
2006-10-04 00:30:14 +00:00
for (i = 0; i < data->nitem; i++)
{
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
}
}
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
static void
2006-10-04 00:30:14 +00:00
ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogDeletePage *data = (ginxlogDeletePage *) XLogRecGetData(record);
Buffer buffer;
Page page;
2006-10-04 00:30:14 +00:00
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
buffer = XLogReadBuffer(data->node, data->blkno, false);
if (BufferIsValid(buffer))
{
page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
Assert(GinPageIsData(page));
GinPageGetOpaque(page)->flags = GIN_DELETED;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
2006-10-04 00:30:14 +00:00
if (!(record->xl_info & XLR_BKP_BLOCK_2))
{
buffer = XLogReadBuffer(data->node, data->parentBlkno, false);
if (BufferIsValid(buffer))
{
page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
Assert(GinPageIsData(page));
Assert(!GinPageIsLeaf(page));
GinPageDeletePostingItem(page, data->parentOffset);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
if (!(record->xl_info & XLR_BKP_BLOCK_3) && data->leftBlkno != InvalidBlockNumber)
2006-10-04 00:30:14 +00:00
{
buffer = XLogReadBuffer(data->node, data->leftBlkno, false);
if (BufferIsValid(buffer))
{
page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
Assert(GinPageIsData(page));
GinPageGetOpaque(page)->rightlink = data->rightLink;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
}
static void
ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
Buffer metabuffer;
Page metapage;
Buffer buffer;
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
if (!BufferIsValid(metabuffer))
elog(PANIC, "GIN metapage disappeared");
metapage = BufferGetPage(metabuffer);
if (!XLByteLE(lsn, PageGetLSN(metapage)))
{
memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
PageSetLSN(metapage, lsn);
PageSetTLI(metapage, ThisTimeLineID);
MarkBufferDirty(metabuffer);
}
if (data->ntuples > 0)
{
/*
* insert into tail page
*/
if (!(record->xl_info & XLR_BKP_BLOCK_1))
{
buffer = XLogReadBuffer(data->node, data->metadata.tail, false);
if (BufferIsValid(buffer))
{
Page page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
OffsetNumber l,
2011-04-10 11:42:00 -04:00
off = (PageIsEmpty(page)) ? FirstOffsetNumber :
OffsetNumberNext(PageGetMaxOffsetNumber(page));
int i,
2011-04-10 11:42:00 -04:00
tupsize;
IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta));
for (i = 0; i < data->ntuples; i++)
{
tupsize = IndexTupleSize(tuples);
l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
if (l == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page");
tuples = (IndexTuple) (((char *) tuples) + tupsize);
off++;
}
/*
* Increase counter of heap tuples
*/
GinPageGetOpaque(page)->maxoff++;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
}
else if (data->prevTail != InvalidBlockNumber)
{
/*
* New tail
*/
buffer = XLogReadBuffer(data->node, data->prevTail, false);
if (BufferIsValid(buffer))
{
Page page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
GinPageGetOpaque(page)->rightlink = data->newRightlink;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
UnlockReleaseBuffer(metabuffer);
}
static void
ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record);
Buffer buffer;
Page page;
OffsetNumber l,
off = FirstOffsetNumber;
int i,
tupsize;
IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage));
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
buffer = XLogReadBuffer(data->node, data->blkno, true);
Assert(BufferIsValid(buffer));
page = BufferGetPage(buffer);
GinInitBuffer(buffer, GIN_LIST);
GinPageGetOpaque(page)->rightlink = data->rightlink;
if (data->rightlink == InvalidBlockNumber)
{
/* tail of sublist */
GinPageSetFullRow(page);
GinPageGetOpaque(page)->maxoff = 1;
}
else
{
GinPageGetOpaque(page)->maxoff = 0;
}
for (i = 0; i < data->ntuples; i++)
{
tupsize = IndexTupleSize(tuples);
l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);
if (l == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page");
tuples = (IndexTuple) (((char *) tuples) + tupsize);
}
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
UnlockReleaseBuffer(buffer);
}
static void
ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record)
{
ginxlogDeleteListPages *data = (ginxlogDeleteListPages *) XLogRecGetData(record);
Buffer metabuffer;
Page metapage;
int i;
metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
if (!BufferIsValid(metabuffer))
elog(PANIC, "GIN metapage disappeared");
metapage = BufferGetPage(metabuffer);
if (!XLByteLE(lsn, PageGetLSN(metapage)))
{
memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
PageSetLSN(metapage, lsn);
PageSetTLI(metapage, ThisTimeLineID);
MarkBufferDirty(metabuffer);
}
for (i = 0; i < data->ndeleted; i++)
{
Buffer buffer = XLogReadBuffer(data->node, data->toDelete[i], false);
if (BufferIsValid(buffer))
{
Page page = BufferGetPage(buffer);
if (!XLByteLE(lsn, PageGetLSN(page)))
{
GinPageGetOpaque(page)->flags = GIN_DELETED;
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
MarkBufferDirty(buffer);
}
UnlockReleaseBuffer(buffer);
}
}
UnlockReleaseBuffer(metabuffer);
}
2006-10-04 00:30:14 +00:00
void
gin_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 01:32:45 +00:00
/*
* GIN indexes do not require any conflict processing.
*/
RestoreBkpBlocks(lsn, record, false);
topCtx = MemoryContextSwitchTo(opCtx);
2006-10-04 00:30:14 +00:00
switch (info)
{
case XLOG_GIN_CREATE_INDEX:
ginRedoCreateIndex(lsn, record);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_CREATE_PTREE:
ginRedoCreatePTree(lsn, record);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_INSERT:
ginRedoInsert(lsn, record);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_SPLIT:
ginRedoSplit(lsn, record);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_VACUUM_PAGE:
ginRedoVacuumPage(lsn, record);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_DELETE_PAGE:
ginRedoDeletePage(lsn, record);
break;
case XLOG_GIN_UPDATE_META_PAGE:
ginRedoUpdateMetapage(lsn, record);
break;
case XLOG_GIN_INSERT_LISTPAGE:
ginRedoInsertListPage(lsn, record);
break;
case XLOG_GIN_DELETE_LISTPAGE:
ginRedoDeleteListPages(lsn, record);
break;
default:
elog(PANIC, "gin_redo: unknown op code %u", info);
}
MemoryContextSwitchTo(topCtx);
MemoryContextReset(opCtx);
}
static void
2006-10-04 00:30:14 +00:00
desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno)
{
appendStringInfo(buf, "node: %u/%u/%u blkno: %u",
node.spcNode, node.dbNode, node.relNode, blkno);
}
2006-10-04 00:30:14 +00:00
void
gin_desc(StringInfo buf, uint8 xl_info, char *rec)
{
uint8 info = xl_info & ~XLR_INFO_MASK;
2006-10-04 00:30:14 +00:00
switch (info)
{
case XLOG_GIN_CREATE_INDEX:
appendStringInfo(buf, "Create index, ");
desc_node(buf, *(RelFileNode *) rec, GIN_ROOT_BLKNO);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_CREATE_PTREE:
appendStringInfo(buf, "Create posting tree, ");
desc_node(buf, ((ginxlogCreatePostingTree *) rec)->node, ((ginxlogCreatePostingTree *) rec)->blkno);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_INSERT:
appendStringInfo(buf, "Insert item, ");
desc_node(buf, ((ginxlogInsert *) rec)->node, ((ginxlogInsert *) rec)->blkno);
appendStringInfo(buf, " offset: %u nitem: %u isdata: %c isleaf %c isdelete %c updateBlkno:%u",
((ginxlogInsert *) rec)->offset,
((ginxlogInsert *) rec)->nitem,
(((ginxlogInsert *) rec)->isData) ? 'T' : 'F',
(((ginxlogInsert *) rec)->isLeaf) ? 'T' : 'F',
(((ginxlogInsert *) rec)->isDelete) ? 'T' : 'F',
((ginxlogInsert *) rec)->updateBlkno
);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_SPLIT:
appendStringInfo(buf, "Page split, ");
desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno);
appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->isRootSplit) ? 'T' : 'F');
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_VACUUM_PAGE:
appendStringInfo(buf, "Vacuum page, ");
desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno);
break;
2006-10-04 00:30:14 +00:00
case XLOG_GIN_DELETE_PAGE:
appendStringInfo(buf, "Delete page, ");
desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno);
break;
case XLOG_GIN_UPDATE_META_PAGE:
appendStringInfo(buf, "Update metapage, ");
desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, ((ginxlogUpdateMeta *) rec)->metadata.tail);
break;
case XLOG_GIN_INSERT_LISTPAGE:
appendStringInfo(buf, "Insert new list page, ");
desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno);
break;
case XLOG_GIN_DELETE_LISTPAGE:
appendStringInfo(buf, "Delete list pages (%d), ", ((ginxlogDeleteListPages *) rec)->ndeleted);
desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, ((ginxlogDeleteListPages *) rec)->metadata.head);
break;
default:
elog(PANIC, "gin_desc: unknown op code %u", info);
}
}
2006-10-04 00:30:14 +00:00
void
gin_xlog_startup(void)
{
incomplete_splits = NIL;
opCtx = AllocSetContextCreate(CurrentMemoryContext,
2006-10-04 00:30:14 +00:00
"GIN recovery temporary context",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
}
static void
2006-10-04 00:30:14 +00:00
ginContinueSplit(ginIncompleteSplit *split)
{
GinBtreeData btree;
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
GinState ginstate;
Relation reln;
Buffer buffer;
2006-10-04 00:30:14 +00:00
GinBtreeStack stack;
2006-10-04 00:30:14 +00:00
/*
* elog(NOTICE,"ginContinueSplit root:%u l:%u r:%u", split->rootBlkno,
* split->leftBlkno, split->rightBlkno);
*/
buffer = XLogReadBuffer(split->node, split->leftBlkno, false);
/*
* Failure should be impossible here, because we wrote the page earlier.
*/
if (!BufferIsValid(buffer))
elog(PANIC, "ginContinueSplit: left block %u not found",
split->leftBlkno);
reln = CreateFakeRelcacheEntry(split->node);
2006-10-04 00:30:14 +00:00
if (split->rootBlkno == GIN_ROOT_BLKNO)
{
Fix GIN to support null keys, empty and null items, and full index scans. Per my recent proposal(s). Null key datums can now be returned by extractValue and extractQuery functions, and will be stored in the index. Also, placeholder entries are made for indexable items that are NULL or contain no keys according to extractValue. This means that the index is now always complete, having at least one entry for every indexed heap TID, and so we can get rid of the prohibition on full-index scans. A full-index scan is implemented much the same way as partial-match scans were already: we build a bitmap representing all the TIDs found in the index, and then drive the results off that. Also, introduce a concept of a "search mode" that can be requested by extractQuery when the operator requires matching to empty items (this is just as cheap as matching to a single key) or requires a full index scan (which is not so cheap, but it sure beats failing or giving wrong answers). The behavior remains backward compatible for opclasses that don't return any null keys or request a non-default search mode. Using these features, we can now make the GIN index opclass for anyarray behave in a way that matches the actual anyarray operators for &&, <@, @>, and = ... which it failed to do before in assorted corner cases. This commit fixes the core GIN code and ginarrayprocs.c, updates the documentation, and adds some simple regression test cases for the new behaviors using the array operators. The tsearch and contrib GIN opclass support functions still need to be looked over and probably fixed. Another thing I intend to fix separately is that this is pretty inefficient for cases where more than one scan condition needs a full-index search: we'll run duplicate GinScanEntrys, each one of which builds a large bitmap. There is some existing logic to merge duplicate GinScanEntrys but it needs refactoring to make it work for entries belonging to different scan keys. Note that most of gin.h has been split out into a new file gin_private.h, so that gin.h doesn't export anything that's not supposed to be used by GIN opclasses or the rest of the backend. I did quite a bit of other code beautification work as well, mostly fixing comments and choosing more appropriate names for things.
2011-01-07 19:16:24 -05:00
MemSet(&ginstate, 0, sizeof(ginstate));
ginstate.index = reln;
ginPrepareEntryScan(&btree,
InvalidOffsetNumber, (Datum) 0, GIN_CAT_NULL_KEY,
&ginstate);
2006-10-04 00:30:14 +00:00
btree.entry = ginPageGetLinkItup(buffer);
}
else
{
Page page = BufferGetPage(buffer);
ginPrepareDataScan(&btree, reln);
2006-10-04 00:30:14 +00:00
PostingItemSetBlockNumber(&(btree.pitem), split->leftBlkno);
if (GinPageIsLeaf(page))
btree.pitem.key = *(ItemPointerData *) GinDataPageGetItem(page,
GinPageGetOpaque(page)->maxoff);
else
2006-10-04 00:30:14 +00:00
btree.pitem.key = ((PostingItem *) GinDataPageGetItem(page,
GinPageGetOpaque(page)->maxoff))->key;
}
2006-10-04 00:30:14 +00:00
btree.rightblkno = split->rightBlkno;
stack.blkno = split->leftBlkno;
stack.buffer = buffer;
stack.off = InvalidOffsetNumber;
stack.parent = NULL;
ginFindParents(&btree, &stack, split->rootBlkno);
ginInsertValue(&btree, stack.parent, NULL);
FreeFakeRelcacheEntry(reln);
2006-10-04 00:30:14 +00:00
UnlockReleaseBuffer(buffer);
}
2006-10-04 00:30:14 +00:00
void
gin_xlog_cleanup(void)
{
ListCell *l;
MemoryContext topCtx;
topCtx = MemoryContextSwitchTo(opCtx);
foreach(l, incomplete_splits)
{
ginIncompleteSplit *split = (ginIncompleteSplit *) lfirst(l);
2006-10-04 00:30:14 +00:00
ginContinueSplit(split);
MemoryContextReset(opCtx);
}
MemoryContextSwitchTo(topCtx);
MemoryContextDelete(opCtx);
incomplete_splits = NIL;
}
bool
gin_safe_restartpoint(void)
{
if (incomplete_splits)
return false;
return true;
}