*** pgsql/src/backend/access/nbtree/nbtree.c	2009/07/29 20:56:18	1.172
--- pgsql/src/backend/access/nbtree/nbtree.c	2009/12/19 01:32:33	1.173
***************
*** 12,18 ****
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
!  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.171 2009/06/11 14:48:54 momjian Exp $
   *
   *-------------------------------------------------------------------------
   */
--- 12,18 ----
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   * IDENTIFICATION
!  *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.172 2009/07/29 20:56:18 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
*************** typedef struct
*** 57,63 ****
  	IndexBulkDeleteCallback callback;
  	void	   *callback_state;
  	BTCycleId	cycleid;
! 	BlockNumber lastUsedPage;
  	BlockNumber totFreePages;	/* true total # of free pages */
  	MemoryContext pagedelcontext;
  } BTVacState;
--- 57,64 ----
  	IndexBulkDeleteCallback callback;
  	void	   *callback_state;
  	BTCycleId	cycleid;
! 	BlockNumber lastBlockVacuumed;	/* last blkno reached by Vacuum scan */
! 	BlockNumber lastUsedPage;	/* blkno of last non-recyclable page */
  	BlockNumber totFreePages;	/* true total # of free pages */
  	MemoryContext pagedelcontext;
  } BTVacState;
*************** btvacuumscan(IndexVacuumInfo *info, Inde
*** 629,634 ****
--- 630,636 ----
  	vstate.callback = callback;
  	vstate.callback_state = callback_state;
  	vstate.cycleid = cycleid;
+ 	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
  	vstate.lastUsedPage = BTREE_METAPAGE;
  	vstate.totFreePages = 0;
  
*************** btvacuumscan(IndexVacuumInfo *info, Inde
*** 705,710 ****
--- 707,738 ----
  		num_pages = new_pages;
  	}
  
+ 	/*
+ 	 * InHotStandby we need to scan right up to the end of the index for
+ 	 * correct locking, so we may need to write a WAL record for the final
+ 	 * block in the index if it was not vacuumed. It's possible that
+ 	 * VACUUMing has actually removed zeroed pages at the end of the index,
+ 	 * so we need to take care to issue the record for the last actual block
+ 	 * and not for the last block that was scanned. Ignore empty indexes.
+ 	 */
+ 	if (XLogStandbyInfoActive() &&
+ 		num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
+ 	{
+ 		Buffer		buf;
+ 
+ 		/*
+ 		 * We can't use _bt_getbuf() here because it always applies
+ 		 * _bt_checkpage(), which will barf on an all-zero page. We want to
+ 		 * recycle all-zero pages, not fail.  Also, we want to use a
+ 		 * nondefault buffer access strategy.
+ 		 */
+ 		buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
+ 								 info->strategy);
+ 		LockBufferForCleanup(buf);
+ 		_bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed);
+ 		_bt_relbuf(rel, buf);
+ 	}
+ 
  	MemoryContextDelete(vstate.pagedelcontext);
  
  	/* update statistics */
*************** restart:
*** 847,852 ****
--- 875,900 ----
  			itup = (IndexTuple) PageGetItem(page,
  											PageGetItemId(page, offnum));
  			htup = &(itup->t_tid);
+ 
+ 			/*
+ 			 * During Hot Standby we currently assume that XLOG_BTREE_VACUUM
+ 			 * records do not produce conflicts. That is only true as long
+ 			 * as the callback function depends only upon whether the index
+ 			 * tuple refers to heap tuples removed in the initial heap scan.
+ 			 * When vacuum starts it derives a value of OldestXmin. Backends
+ 			 * taking later snapshots could have a RecentGlobalXmin with a
+ 			 * later xid than the vacuum's OldestXmin, so it is possible that
+ 			 * row versions deleted after OldestXmin could be marked as
+ 			 * killed by other backends. The callback function *could* look
+ 			 * at the index tuple state in isolation and decide to delete the
+ 			 * index tuple, though currently it does not. If it ever did, we
+ 			 * would need to reconsider whether XLOG_BTREE_VACUUM records
+ 			 * should cause conflicts. If they did cause conflicts, they
+ 			 * would be fairly harsh conflicts, since we haven't yet worked
+ 			 * out a way to pass a useful value for latestRemovedXid on the
+ 			 * XLOG_BTREE_VACUUM records. This applies to *any* type of index
+ 			 * that marks index tuples as killed.
+ 			 */
  			if (callback(htup, callback_state))
  				deletable[ndeletable++] = offnum;
  		}
*************** restart:
*** 858,864 ****
  		 */
  		if (ndeletable > 0)
  		{
! 			_bt_delitems(rel, buf, deletable, ndeletable);
  			stats->tuples_removed += ndeletable;
  			/* must recompute maxoff */
  			maxoff = PageGetMaxOffsetNumber(page);
--- 906,924 ----
  		 */
  		if (ndeletable > 0)
  		{
! 			BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
! 
! 			_bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
! 
! 			/*
! 			 * Keep track of the block number of the last block we vacuumed
! 			 * (lastBlockVacuumed), so those blocks can be scanned again
! 			 * during WAL replay. This provides concurrency protection and
! 			 * allows btrees to be used while in recovery.
! 			 */
! 			if (lastBlockVacuumed > vstate->lastBlockVacuumed)
! 				vstate->lastBlockVacuumed = lastBlockVacuumed;
! 
  			stats->tuples_removed += ndeletable;
  			/* must recompute maxoff */
  			maxoff = PageGetMaxOffsetNumber(page);
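
For context on how lastBlockVacuumed is consumed: the replay side of an
XLOG_BTREE_VACUUM record is expected to take a momentary cleanup lock on every
block between the previous record's lastBlockVacuumed and the block the record
names, so that standby queries pinning those pages are waited out just as they
would have been by VACUUM on the primary. What follows is a minimal,
hypothetical sketch of that redo loop, not the committed nbtxlog.c code: it
assumes an xl_btree_vacuum record carrying node, block, and lastBlockVacuumed
fields (defined outside this hunk), and the name btree_xlog_vacuum_sketch is
purely illustrative.

#include "access/nbtree.h"
#include "access/xlogutils.h"
#include "storage/bufmgr.h"

/* Sketch only: lock-sweep half of XLOG_BTREE_VACUUM redo on a standby. */
static void
btree_xlog_vacuum_sketch(xl_btree_vacuum *xlrec)
{
	BlockNumber blkno;

	/*
	 * Cleanup-lock every block the primary's VACUUM scanned between the
	 * previous XLOG_BTREE_VACUUM record and this one.  Any standby scan
	 * still holding a pin on one of those blocks is waited out here,
	 * mirroring the primary's locking protocol.
	 */
	for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
	{
		Buffer		buffer;

		/* RBM_NORMAL: the page may be missing during crash recovery */
		buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
										blkno, RBM_NORMAL);
		if (BufferIsValid(buffer))
		{
			LockBufferForCleanup(buffer);	/* waits for pins to drop */
			UnlockReleaseBuffer(buffer);
		}
	}

	/* ... then redo the actual item deletions on xlrec->block itself ... */
}

Carrying lastBlockVacuumed inside each record keeps WAL volume down, since no
record need be emitted for pages that were scanned but had nothing to delete;
the extra record written at the end of btvacuumscan() merely closes the gap
between the last page that had deletions and the true end of the index.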