diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 4820f76e3bb..997d2721d56 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -486,6 +486,28 @@ normal running after recovery has completed. This is a key capability because it allows running applications to continue while the standby changes state into a normally running server. +The interlocking required to avoid returning incorrect results from +non-MVCC scans is not required on standby nodes. That is because +HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(), +HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only +ever used during write transactions, which cannot exist on the standby. +MVCC scans are already protected by definition, so HeapTupleSatisfiesMVCC() +is not a problem. That leaves concern only for HeapTupleSatisfiesToast(). +HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's +because it doesn't need to - if the main heap row is visible then the +toast rows will also be visible. So as long as we follow a toast +pointer from a visible (live) tuple the corresponding toast rows +will also be visible, so we do not need to recheck MVCC on them. +There is one minor exception, which is that the optimizer sometimes +looks at the boundaries of value ranges using SnapshotDirty, which +could result in returning a newer value for query statistics; this +would affect the query plan in rare cases, but not the correctness. +The risk window is small since the stats look at the min and max values +in the index, so the scan retrieves a tid then immediately uses it +to look in the heap. It is unlikely that the tid could have been +deleted, vacuumed and re-inserted in the time taken to look in the heap +via direct tid access. So we ignore that scan type as a problem. + Other Things That Are Handy to Know ----------------------------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 36dc6c278ea..469c2ed83fd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -811,6 +811,10 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, } /* + * Check to see if we need to issue one final WAL record for this index, + * which may be needed for correctness on a hot standby node when non-MVCC + * index scans could take place. + * * If the WAL is replayed in hot standby, the replay process needs to get * cleanup locks on all index leaf pages, just as we've been doing here. * However, we won't issue any WAL records about pages that have no items @@ -1018,10 +1022,13 @@ restart: if (ndeletable > 0) { /* - * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an - * instruction to the replay code to get cleanup lock on all pages - * between the previous lastBlockVacuumed and this page. This - * ensures that WAL replay locks all leaf pages at some point. + * Notice that the issued XLOG_BTREE_VACUUM WAL record includes + * all information to the replay code to allow it to get a cleanup + * lock on all pages between the previous lastBlockVacuumed and + * this page. This ensures that WAL replay locks all leaf pages at + * some point, which is important should non-MVCC scans be + * requested. This is currently unused on standby, but we record + * it anyway, so that the WAL contains the required information. * * Since we can visit leaf pages out-of-order when recursing, * replay might end up locking such pages an extra time, but it diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 2debb870bd0..42d65fa385f 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -479,7 +479,24 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) Page page; BTPageOpaque opaque; +#ifdef UNUSED /* + * This section of code is thought to be no longer needed, after analysis + * of the calling paths. It is retained to allow the code to be reinstated + * if a flaw is revealed in that thinking. + * + * If we are running non-MVCC scans using this index we need to do some + * additional work to ensure correctness, which is known as a "pin scan" + * described in more detail in next paragraphs. We used to do the extra + * work in all cases, whereas we now avoid that work in most cases. If + * lastBlockVacuumed is set to InvalidBlockNumber then we skip the + * additional work required for the pin scan. + * + * Avoiding this extra work is important since it requires us to touch + * every page in the index, so is an O(N) operation. Worse, it is an + * operation performed in the foreground during redo, so it delays + * replication directly. + * * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of @@ -500,7 +517,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ - if (HotStandbyActiveInReplay()) + if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) { BlockNumber blkno; @@ -517,7 +534,8 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is - * not in shared_buffers we confirm it as unpinned. + * not in shared_buffers we confirm it as unpinned. Optimizing + * this is now moot, since in most cases we avoid the scan. */ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); @@ -528,6 +546,7 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) } } } +#endif /* * If we have a full-page image, restore it (using a cleanup lock) and diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index f2817590c41..09708c01c81 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -342,8 +342,10 @@ typedef struct xl_btree_reuse_page * The WAL record can represent deletion of any number of index tuples on a * single index page when executed by VACUUM. * - * The correctness requirement for applying these changes during recovery is - * that we must do one of these two things for every block in the index: + * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber. + * For a non-MVCC index scans there is an additional correctness requirement + * for applying these changes during recovery, which is that we must do one + * of these two things for every block in the index: * * lock the block for cleanup and apply any required changes * * EnsureBlockUnpinned() * The purpose of this is to ensure that no index scans started before we