summaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r--src/backend/access/nbtree/nbtcompare.c201
-rw-r--r--src/backend/access/nbtree/nbtinsert.c2923
-rw-r--r--src/backend/access/nbtree/nbtpage.c902
-rw-r--r--src/backend/access/nbtree/nbtree.c927
-rw-r--r--src/backend/access/nbtree/nbtscan.c267
-rw-r--r--src/backend/access/nbtree/nbtsearch.c2617
-rw-r--r--src/backend/access/nbtree/nbtsort.c1926
-rw-r--r--src/backend/access/nbtree/nbtstrat.c156
-rw-r--r--src/backend/access/nbtree/nbtutils.c623
9 files changed, 5423 insertions, 5119 deletions
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index f005509be07..0312bbb69d7 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -1,22 +1,22 @@
/*-------------------------------------------------------------------------
*
* nbtcompare.c--
- * Comparison functions for btree access method.
+ * Comparison functions for btree access method.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.10 1997/06/11 05:20:05 vadim Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.11 1997/09/07 04:38:39 momjian Exp $
*
- * NOTES
- * These functions are stored in pg_amproc. For each operator class
- * defined on btrees, they compute
+ * NOTES
+ * These functions are stored in pg_amproc. For each operator class
+ * defined on btrees, they compute
*
- * compare(a, b):
- * < 0 if a < b,
- * = 0 if a == b,
- * > 0 if a > b.
+ * compare(a, b):
+ * < 0 if a < b,
+ * = 0 if a == b,
+ * > 0 if a > b.
*-------------------------------------------------------------------------
*/
@@ -30,168 +30,171 @@
int32
btint2cmp(int16 a, int16 b)
{
- return ((int32) (a - b));
+ return ((int32) (a - b));
}
int32
btint4cmp(int32 a, int32 b)
{
- return (a - b);
+ return (a - b);
}
int32
btint24cmp(int16 a, int32 b)
{
- return (((int32) a) - b);
+ return (((int32) a) - b);
}
int32
btint42cmp(int32 a, int16 b)
{
- return (a - ((int32) b));
+ return (a - ((int32) b));
}
int32
btfloat4cmp(float32 a, float32 b)
{
- if (*a > *b)
- return (1);
- else if (*a == *b)
- return (0);
- else
- return (-1);
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
}
int32
btfloat8cmp(float64 a, float64 b)
{
- if (*a > *b)
- return (1);
- else if (*a == *b)
- return (0);
- else
- return (-1);
+ if (*a > *b)
+ return (1);
+ else if (*a == *b)
+ return (0);
+ else
+ return (-1);
}
int32
btoidcmp(Oid a, Oid b)
{
- if (a > b)
- return (1);
- else if (a == b)
- return (0);
- else
- return (-1);
+ if (a > b)
+ return (1);
+ else if (a == b)
+ return (0);
+ else
+ return (-1);
}
int32
btabstimecmp(AbsoluteTime a, AbsoluteTime b)
{
- if (AbsoluteTimeIsBefore(a, b))
- return (-1);
- else if (AbsoluteTimeIsBefore(b, a))
- return (1);
- else
- return (0);
+ if (AbsoluteTimeIsBefore(a, b))
+ return (-1);
+ else if (AbsoluteTimeIsBefore(b, a))
+ return (1);
+ else
+ return (0);
}
int32
btcharcmp(char a, char b)
{
- return ((int32) ((uint8)a - (uint8)b));
+ return ((int32) ((uint8) a - (uint8) b));
}
int32
btchar2cmp(uint16 a, uint16 b)
{
- return (strncmp((char *) &a, (char *) &b, 2));
+ return (strncmp((char *) &a, (char *) &b, 2));
}
int32
btchar4cmp(uint32 a, uint32 b)
{
- return (strncmp((char *) &a, (char *) &b, 4));
+ return (strncmp((char *) &a, (char *) &b, 4));
}
int32
btchar8cmp(char *a, char *b)
{
- return (strncmp(a, b, 8));
+ return (strncmp(a, b, 8));
}
int32
btchar16cmp(char *a, char *b)
{
- return (strncmp(a, b, 16));
+ return (strncmp(a, b, 16));
}
int32
-btnamecmp(NameData *a, NameData *b)
+btnamecmp(NameData * a, NameData * b)
{
- return (strncmp(a->data, b->data, NAMEDATALEN));
+ return (strncmp(a->data, b->data, NAMEDATALEN));
}
int32
-bttextcmp(struct varlena *a, struct varlena *b)
+bttextcmp(struct varlena * a, struct varlena * b)
{
- int res;
- unsigned char *ap, *bp;
+ int res;
+ unsigned char *ap,
+ *bp;
#ifdef USE_LOCALE
- int la = VARSIZE(a) - VARHDRSZ;
- int lb = VARSIZE(b) - VARHDRSZ;
-
- ap = (unsigned char *) palloc (la + 1);
- bp = (unsigned char *) palloc (lb + 1);
-
- memcpy(ap, VARDATA(a), la);
- *(ap + la) = '\0';
- memcpy(bp, VARDATA(b), lb);
- *(bp + lb) = '\0';
-
- res = strcoll (ap, bp);
-
- pfree (ap);
- pfree (bp);
+ int la = VARSIZE(a) - VARHDRSZ;
+ int lb = VARSIZE(b) - VARHDRSZ;
+
+ ap = (unsigned char *) palloc(la + 1);
+ bp = (unsigned char *) palloc(lb + 1);
+
+ memcpy(ap, VARDATA(a), la);
+ *(ap + la) = '\0';
+ memcpy(bp, VARDATA(b), lb);
+ *(bp + lb) = '\0';
+
+ res = strcoll(ap, bp);
+
+ pfree(ap);
+ pfree(bp);
#else
- int len = VARSIZE(a);
-
- /* len is the length of the shorter of the two strings */
- if ( len > VARSIZE(b) )
- len = VARSIZE(b);
-
- len -= VARHDRSZ;
-
- ap = (unsigned char *) VARDATA(a);
- bp = (unsigned char *) VARDATA(b);
-
- /*
- * If the two strings differ in the first len bytes, or if they're
- * the same in the first len bytes and they're both len bytes long,
- * we're done.
- */
-
- res = 0;
- if (len > 0) {
- do {
- res = (int) (*ap++ - *bp++);
- len--;
- } while (res == 0 && len != 0);
- }
+ int len = VARSIZE(a);
+
+ /* len is the length of the shorter of the two strings */
+ if (len > VARSIZE(b))
+ len = VARSIZE(b);
+
+ len -= VARHDRSZ;
+
+ ap = (unsigned char *) VARDATA(a);
+ bp = (unsigned char *) VARDATA(b);
+
+ /*
+ * If the two strings differ in the first len bytes, or if they're the
+ * same in the first len bytes and they're both len bytes long, we're
+ * done.
+ */
+
+ res = 0;
+ if (len > 0)
+ {
+ do
+ {
+ res = (int) (*ap++ - *bp++);
+ len--;
+ } while (res == 0 && len != 0);
+ }
#endif
-
- if (res != 0 || VARSIZE(a) == VARSIZE(b))
- return (res);
-
- /*
- * The two strings are the same in the first len bytes, and they
- * are of different lengths.
- */
-
- if (VARSIZE(a) < VARSIZE(b))
- return (-1);
- else
- return (1);
+
+ if (res != 0 || VARSIZE(a) == VARSIZE(b))
+ return (res);
+
+ /*
+ * The two strings are the same in the first len bytes, and they are
+ * of different lengths.
+ */
+
+ if (VARSIZE(a) < VARSIZE(b))
+ return (-1);
+ else
+ return (1);
}
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 4dfa6fd2558..4bafbc2ddbb 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* btinsert.c--
- * Item insertion in Lehman and Yao btrees for Postgres.
+ * Item insertion in Lehman and Yao btrees for Postgres.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.17 1997/08/20 14:53:15 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.18 1997/09/07 04:38:45 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -22,1386 +22,1437 @@
#include <fmgr.h>
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
-static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright);
+static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright);
static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
-static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
+static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
-static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
-static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, BTItem oldItem, BTItem newItem);
-static bool _bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey);
+static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
+static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, BTItem oldItem, BTItem newItem);
+static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey);
/*
- * _bt_doinsert() -- Handle insertion of a single btitem in the tree.
+ * _bt_doinsert() -- Handle insertion of a single btitem in the tree.
*
- * This routine is called by the public interface routines, btbuild
- * and btinsert. By here, btitem is filled in, and has a unique
- * (xid, seqno) pair.
+ * This routine is called by the public interface routines, btbuild
+ * and btinsert. By here, btitem is filled in, and has a unique
+ * (xid, seqno) pair.
*/
InsertIndexResult
_bt_doinsert(Relation rel, BTItem btitem, bool index_is_unique, Relation heapRel)
{
- ScanKey itup_scankey;
- IndexTuple itup;
- BTStack stack;
- Buffer buf;
- BlockNumber blkno;
- int natts = rel->rd_rel->relnatts;
- InsertIndexResult res;
-
- itup = &(btitem->bti_itup);
-
- /* we need a scan key to do our search, so build one */
- itup_scankey = _bt_mkscankey(rel, itup);
-
- /* find the page containing this key */
- stack = _bt_search(rel, natts, itup_scankey, &buf);
-
- blkno = BufferGetBlockNumber(buf);
-
- /* trade in our read lock for a write lock */
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
-
- /*
- * If the page was split between the time that we surrendered our
- * read lock and acquired our write lock, then this page may no
- * longer be the right place for the key we want to insert. In this
- * case, we need to move right in the tree. See Lehman and Yao for
- * an excruciatingly precise description.
- */
-
- buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
-
- /* if we're not allowing duplicates, make sure the key isn't */
- /* already in the node */
- if ( index_is_unique )
- {
- OffsetNumber offset, maxoff;
- Page page;
+ ScanKey itup_scankey;
+ IndexTuple itup;
+ BTStack stack;
+ Buffer buf;
+ BlockNumber blkno;
+ int natts = rel->rd_rel->relnatts;
+ InsertIndexResult res;
- page = BufferGetPage(buf);
- maxoff = PageGetMaxOffsetNumber (page);
+ itup = &(btitem->bti_itup);
+
+ /* we need a scan key to do our search, so build one */
+ itup_scankey = _bt_mkscankey(rel, itup);
+
+ /* find the page containing this key */
+ stack = _bt_search(rel, natts, itup_scankey, &buf);
- offset = _bt_binsrch(rel, buf, natts, itup_scankey, BT_DESCENT);
+ blkno = BufferGetBlockNumber(buf);
- /* make sure the offset we're given points to an actual */
- /* key on the page before trying to compare it */
- if ( !PageIsEmpty (page) && offset <= maxoff )
+ /* trade in our read lock for a write lock */
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+
+ /*
+ * If the page was split between the time that we surrendered our read
+ * lock and acquired our write lock, then this page may no longer be
+ * the right place for the key we want to insert. In this case, we
+ * need to move right in the tree. See Lehman and Yao for an
+ * excruciatingly precise description.
+ */
+
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
+
+ /* if we're not allowing duplicates, make sure the key isn't */
+ /* already in the node */
+ if (index_is_unique)
{
- TupleDesc itupdesc;
- BTItem btitem;
- IndexTuple itup;
- HeapTuple htup;
- BTPageOpaque opaque;
- Buffer nbuf;
- BlockNumber blkno;
-
- itupdesc = RelationGetTupleDescriptor(rel);
- nbuf = InvalidBuffer;
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- /*
- * _bt_compare returns 0 for (1,NULL) and (1,NULL) -
- * this's how we handling NULLs - and so we must not use
- * _bt_compare in real comparison, but only for
- * ordering/finding items on pages. - vadim 03/24/97
-
- while ( !_bt_compare (rel, itupdesc, page,
- natts, itup_scankey, offset) )
- */
- while ( _bt_isequal (itupdesc, page, offset, natts, itup_scankey) )
- { /* they're equal */
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offset));
- itup = &(btitem->bti_itup);
- htup = heap_fetch (heapRel, SelfTimeQual, &(itup->t_tid), NULL);
- if ( htup != (HeapTuple) NULL )
- { /* it is a duplicate */
- elog(WARN, "Cannot insert a duplicate key into a unique index.");
- }
- /* get next offnum */
- if ( offset < maxoff )
- {
- offset = OffsetNumberNext(offset);
- }
- else
- { /* move right ? */
- if ( P_RIGHTMOST (opaque) )
- break;
- if ( !_bt_isequal (itupdesc, page, P_HIKEY,
- natts, itup_scankey) )
- break;
- /*
- * min key of the right page is the same,
- * ooh - so many dead duplicates...
- */
- blkno = opaque->btpo_next;
- if ( nbuf != InvalidBuffer )
- _bt_relbuf (rel, nbuf, BT_READ);
- for (nbuf = InvalidBuffer; ; )
- {
- nbuf = _bt_getbuf (rel, blkno, BT_READ);
- page = BufferGetPage (nbuf);
- maxoff = PageGetMaxOffsetNumber(page);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- offset = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- if ( ! PageIsEmpty (page) && offset <= maxoff )
- { /* Found some key */
- break;
- }
- else
- { /* Empty or "pseudo"-empty page - get next */
- blkno = opaque->btpo_next;
- _bt_relbuf (rel, nbuf, BT_READ);
- nbuf = InvalidBuffer;
- if ( blkno == P_NONE )
- break;
+ OffsetNumber offset,
+ maxoff;
+ Page page;
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ offset = _bt_binsrch(rel, buf, natts, itup_scankey, BT_DESCENT);
+
+ /* make sure the offset we're given points to an actual */
+ /* key on the page before trying to compare it */
+ if (!PageIsEmpty(page) && offset <= maxoff)
+ {
+ TupleDesc itupdesc;
+ BTItem btitem;
+ IndexTuple itup;
+ HeapTuple htup;
+ BTPageOpaque opaque;
+ Buffer nbuf;
+ BlockNumber blkno;
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ nbuf = InvalidBuffer;
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /*
+ * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
+ * how we handling NULLs - and so we must not use _bt_compare
+ * in real comparison, but only for ordering/finding items on
+ * pages. - vadim 03/24/97
+ *
+ * while ( !_bt_compare (rel, itupdesc, page, natts,
+ * itup_scankey, offset) )
+ */
+ while (_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
+ { /* they're equal */
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offset));
+ itup = &(btitem->bti_itup);
+ htup = heap_fetch(heapRel, SelfTimeQual, &(itup->t_tid), NULL);
+ if (htup != (HeapTuple) NULL)
+ { /* it is a duplicate */
+ elog(WARN, "Cannot insert a duplicate key into a unique index.");
+ }
+ /* get next offnum */
+ if (offset < maxoff)
+ {
+ offset = OffsetNumberNext(offset);
+ }
+ else
+ { /* move right ? */
+ if (P_RIGHTMOST(opaque))
+ break;
+ if (!_bt_isequal(itupdesc, page, P_HIKEY,
+ natts, itup_scankey))
+ break;
+
+ /*
+ * min key of the right page is the same, ooh - so
+ * many dead duplicates...
+ */
+ blkno = opaque->btpo_next;
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf, BT_READ);
+ for (nbuf = InvalidBuffer;;)
+ {
+ nbuf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(nbuf);
+ maxoff = PageGetMaxOffsetNumber(page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ offset = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ if (!PageIsEmpty(page) && offset <= maxoff)
+ { /* Found some key */
+ break;
+ }
+ else
+ { /* Empty or "pseudo"-empty page - get next */
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, nbuf, BT_READ);
+ nbuf = InvalidBuffer;
+ if (blkno == P_NONE)
+ break;
+ }
+ }
+ if (nbuf == InvalidBuffer)
+ break;
+ }
}
- }
- if ( nbuf == InvalidBuffer )
- break;
- }
- }
- if ( nbuf != InvalidBuffer )
- _bt_relbuf(rel, nbuf, BT_READ);
+ if (nbuf != InvalidBuffer)
+ _bt_relbuf(rel, nbuf, BT_READ);
+ }
}
- }
-
- /* do the insertion */
- res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
- btitem, (BTItem) NULL);
-
- /* be tidy */
- _bt_freestack(stack);
- _bt_freeskey(itup_scankey);
-
- return (res);
+
+ /* do the insertion */
+ res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
+ btitem, (BTItem) NULL);
+
+ /* be tidy */
+ _bt_freestack(stack);
+ _bt_freeskey(itup_scankey);
+
+ return (res);
}
/*
- * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
+ * _bt_insertonpg() -- Insert a tuple on a particular page in the index.
*
- * This recursive procedure does the following things:
+ * This recursive procedure does the following things:
*
- * + if necessary, splits the target page.
- * + finds the right place to insert the tuple (taking into
- * account any changes induced by a split).
- * + inserts the tuple.
- * + if the page was split, pops the parent stack, and finds the
- * right place to insert the new child pointer (by walking
- * right using information stored in the parent stack).
- * + invoking itself with the appropriate tuple for the right
- * child page on the parent.
+ * + if necessary, splits the target page.
+ * + finds the right place to insert the tuple (taking into
+ * account any changes induced by a split).
+ * + inserts the tuple.
+ * + if the page was split, pops the parent stack, and finds the
+ * right place to insert the new child pointer (by walking
+ * right using information stored in the parent stack).
+ * + invoking itself with the appropriate tuple for the right
+ * child page on the parent.
*
- * On entry, we must have the right buffer on which to do the
- * insertion, and the buffer must be pinned and locked. On return,
- * we will have dropped both the pin and the write lock on the buffer.
+ * On entry, we must have the right buffer on which to do the
+ * insertion, and the buffer must be pinned and locked. On return,
+ * we will have dropped both the pin and the write lock on the buffer.
*
- * The locking interactions in this code are critical. You should
- * grok Lehman and Yao's paper before making any changes. In addition,
- * you need to understand how we disambiguate duplicate keys in this
- * implementation, in order to be able to find our location using
- * L&Y "move right" operations. Since we may insert duplicate user
- * keys, and since these dups may propogate up the tree, we use the
- * 'afteritem' parameter to position ourselves correctly for the
- * insertion on internal pages.
+ * The locking interactions in this code are critical. You should
+ * grok Lehman and Yao's paper before making any changes. In addition,
+ * you need to understand how we disambiguate duplicate keys in this
+ * implementation, in order to be able to find our location using
+ * L&Y "move right" operations. Since we may insert duplicate user
+ * keys, and since these dups may propogate up the tree, we use the
+ * 'afteritem' parameter to position ourselves correctly for the
+ * insertion on internal pages.
*/
-static InsertIndexResult
+static InsertIndexResult
_bt_insertonpg(Relation rel,
- Buffer buf,
- BTStack stack,
- int keysz,
- ScanKey scankey,
- BTItem btitem,
- BTItem afteritem)
+ Buffer buf,
+ BTStack stack,
+ int keysz,
+ ScanKey scankey,
+ BTItem btitem,
+ BTItem afteritem)
{
- InsertIndexResult res;
- Page page;
- BTPageOpaque lpageop;
- BlockNumber itup_blkno;
- OffsetNumber itup_off;
- OffsetNumber firstright = InvalidOffsetNumber;
- int itemsz;
- bool do_split = false;
- bool keys_equal = false;
-
- page = BufferGetPage(buf);
- lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- itemsz = IndexTupleDSize(btitem->bti_itup)
- + (sizeof(BTItemData) - sizeof(IndexTupleData));
-
- itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this
- but we need to be consistent */
- /*
- * If we have to insert item on the leftmost page which is the first
- * page in the chain of duplicates then:
- * 1. if scankey == hikey (i.e. - new duplicate item) then
- * insert it here;
- * 2. if scankey < hikey then:
- * 2.a if there is duplicate key(s) here - we force splitting;
- * 2.b else - we may "eat" this page from duplicates chain.
- */
- if ( lpageop->btpo_flags & BTP_CHAIN )
- {
- OffsetNumber maxoff = PageGetMaxOffsetNumber (page);
- ItemId hitemid;
- BTItem hitem;
-
- Assert ( !P_RIGHTMOST(lpageop) );
- hitemid = PageGetItemId(page, P_HIKEY);
- hitem = (BTItem) PageGetItem(page, hitemid);
- if ( maxoff > P_HIKEY &&
- !_bt_itemcmp (rel, keysz, hitem,
- (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)),
- BTEqualStrategyNumber) )
- elog (FATAL, "btree: bad key on the page in the chain of duplicates");
-
- if ( !_bt_skeycmp (rel, keysz, scankey, page, hitemid,
- BTEqualStrategyNumber) )
- {
- if ( !P_LEFTMOST(lpageop) )
- elog (FATAL, "btree: attempt to insert bad key on the non-leftmost page in the chain of duplicates");
- if ( !_bt_skeycmp (rel, keysz, scankey, page, hitemid,
- BTLessStrategyNumber) )
- elog (FATAL, "btree: attempt to insert higher key on the leftmost page in the chain of duplicates");
- if ( maxoff > P_HIKEY ) /* have duplicate(s) */
- {
- firstright = P_FIRSTKEY;
- do_split = true;
- }
- else /* "eat" page */
- {
- Buffer pbuf;
- Page ppage;
-
- itup_blkno = BufferGetBlockNumber(buf);
- itup_off = PageAddItem(page, (Item) btitem, itemsz,
- P_FIRSTKEY, LP_USED);
- if ( itup_off == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item");
- lpageop->btpo_flags &= ~BTP_CHAIN;
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
- ppage = BufferGetPage(pbuf);
- PageIndexTupleDelete(ppage, stack->bts_offset);
- pfree(stack->bts_btitem);
- stack->bts_btitem = _bt_formitem(&(btitem->bti_itup));
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
- itup_blkno, P_HIKEY);
- _bt_wrtbuf(rel, buf);
- res = _bt_insertonpg(rel, pbuf, stack->bts_parent,
- keysz, scankey, stack->bts_btitem,
- NULL);
- ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
- return (res);
- }
- }
- else
- {
- keys_equal = true;
- if ( PageGetFreeSpace(page) < itemsz )
- do_split = true;
- }
- }
- else if ( PageGetFreeSpace(page) < itemsz )
- do_split = true;
- else if ( PageGetFreeSpace(page) < 3*itemsz + 2*sizeof(ItemIdData) )
- {
- OffsetNumber offnum = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY;
- OffsetNumber maxoff = PageGetMaxOffsetNumber (page);
- ItemId itid;
- BTItem previtem, chkitem;
- Size maxsize;
- Size currsize;
-
- itid = PageGetItemId(page, offnum);
- previtem = (BTItem) PageGetItem(page, itid);
- maxsize = currsize = (ItemIdGetLength(itid) + sizeof(ItemIdData));
- for (offnum = OffsetNumberNext(offnum);
- offnum <= maxoff; offnum = OffsetNumberNext(offnum) )
- {
- itid = PageGetItemId(page, offnum);
- chkitem = (BTItem) PageGetItem(page, itid);
- if ( !_bt_itemcmp (rel, keysz, previtem, chkitem,
- BTEqualStrategyNumber) )
- {
- if ( currsize > maxsize )
- maxsize = currsize;
- currsize = 0;
- previtem = chkitem;
- }
- currsize += (ItemIdGetLength(itid) + sizeof(ItemIdData));
- }
- if ( currsize > maxsize )
- maxsize = currsize;
- maxsize += sizeof (PageHeaderData) +
- DOUBLEALIGN (sizeof (BTPageOpaqueData));
- if ( maxsize >= PageGetPageSize (page) / 2 )
- do_split = true;
- }
-
- if ( do_split )
- {
- Buffer rbuf;
- Page rpage;
- BTItem ritem;
- BlockNumber rbknum;
- BTPageOpaque rpageop;
- Buffer pbuf;
- Page ppage;
- BTPageOpaque ppageop;
- BlockNumber bknum = BufferGetBlockNumber(buf);
- BTItem lowLeftItem;
- OffsetNumber maxoff;
- bool shifted = false;
- bool left_chained = ( lpageop->btpo_flags & BTP_CHAIN ) ? true : false;
-
- /*
- * If we have to split leaf page in the chain of duplicates by
- * new duplicate then we try to look at our right sibling first.
- */
- if ( ( lpageop->btpo_flags & BTP_CHAIN ) &&
- ( lpageop->btpo_flags & BTP_LEAF ) && keys_equal )
- {
- bool use_left = true;
-
- rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
- rpage = BufferGetPage(rbuf);
- rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
- if ( !P_RIGHTMOST (rpageop) ) /* non-rightmost page */
- { /*
- * If we have the same hikey here then it's
- * yet another page in chain.
- */
- if ( _bt_skeycmp (rel, keysz, scankey, rpage,
- PageGetItemId(rpage, P_HIKEY),
- BTEqualStrategyNumber) )
- {
- if ( !( rpageop->btpo_flags & BTP_CHAIN ) )
- elog (FATAL, "btree: lost page in the chain of duplicates");
- }
- else if ( _bt_skeycmp (rel, keysz, scankey, rpage,
- PageGetItemId(rpage, P_HIKEY),
- BTGreaterStrategyNumber) )
- elog (FATAL, "btree: hikey is out of order");
- else if ( rpageop->btpo_flags & BTP_CHAIN )
- /*
- * If hikey > scankey then it's last page in chain and
- * BTP_CHAIN must be OFF
- */
- elog (FATAL, "btree: lost last page in the chain of duplicates");
-
- /* if there is room here then we use this page. */
- if ( PageGetFreeSpace (rpage) > itemsz )
- use_left = false;
- }
- else /* rightmost page */
- {
- Assert ( !( rpageop->btpo_flags & BTP_CHAIN ) );
- /* if there is room here then we use this page. */
- if ( PageGetFreeSpace (rpage) > itemsz )
- use_left = false;
- }
- if ( !use_left ) /* insert on the right page */
- {
- _bt_relbuf(rel, buf, BT_WRITE);
- return ( _bt_insertonpg(rel, rbuf, stack, keysz,
- scankey, btitem, afteritem) );
- }
- _bt_relbuf(rel, rbuf, BT_WRITE);
- }
+ InsertIndexResult res;
+ Page page;
+ BTPageOpaque lpageop;
+ BlockNumber itup_blkno;
+ OffsetNumber itup_off;
+ OffsetNumber firstright = InvalidOffsetNumber;
+ int itemsz;
+ bool do_split = false;
+ bool keys_equal = false;
+
+ page = BufferGetPage(buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ itemsz = IndexTupleDSize(btitem->bti_itup)
+ + (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do
+ * this but we need to be
+ * consistent */
+
/*
- * If after splitting un-chained page we'll got chain of pages
- * with duplicates then we want to know
- * 1. on which of two pages new btitem will go (current
- * _bt_findsplitloc is quite bad);
- * 2. what parent (if there's one) thinking about it
- * (remember about deletions)
+ * If we have to insert item on the leftmost page which is the first
+ * page in the chain of duplicates then: 1. if scankey == hikey (i.e.
+ * - new duplicate item) then insert it here; 2. if scankey < hikey
+ * then: 2.a if there is duplicate key(s) here - we force splitting;
+ * 2.b else - we may "eat" this page from duplicates chain.
*/
- else if ( !( lpageop->btpo_flags & BTP_CHAIN ) )
+ if (lpageop->btpo_flags & BTP_CHAIN)
{
- OffsetNumber start = ( P_RIGHTMOST(lpageop) ) ? P_HIKEY : P_FIRSTKEY;
- Size llimit;
-
- maxoff = PageGetMaxOffsetNumber (page);
- llimit = PageGetPageSize(page) - sizeof (PageHeaderData) -
- DOUBLEALIGN (sizeof (BTPageOpaqueData))
- + sizeof(ItemIdData);
- llimit /= 2;
- firstright = _bt_findsplitloc(rel, page, start, maxoff, llimit);
-
- if ( _bt_itemcmp (rel, keysz,
- (BTItem) PageGetItem(page, PageGetItemId(page, start)),
- (BTItem) PageGetItem(page, PageGetItemId(page, firstright)),
- BTEqualStrategyNumber) )
- {
- if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId(page, firstright),
- BTLessStrategyNumber) )
- /*
- * force moving current items to the new page:
- * new item will go on the current page.
- */
- firstright = start;
- else
- /*
- * new btitem >= firstright, start item == firstright -
- * new chain of duplicates: if this non-leftmost leaf
- * page and parent item < start item then force moving
- * all items to the new page - current page will be
- * "empty" after it.
- */
- {
- if ( !P_LEFTMOST (lpageop) &&
- ( lpageop->btpo_flags & BTP_LEAF ) )
- {
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
- bknum, P_HIKEY);
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
- if ( _bt_itemcmp (rel, keysz, stack->bts_btitem,
- (BTItem) PageGetItem(page,
- PageGetItemId(page, start)),
- BTLessStrategyNumber) )
- {
- firstright = start;
- shifted = true;
- }
- _bt_relbuf(rel, pbuf, BT_WRITE);
- }
- }
- } /* else - no new chain if start item < firstright one */
- }
-
- /* split the buffer into left and right halves */
- rbuf = _bt_split(rel, buf, firstright);
-
- /* which new page (left half or right half) gets the tuple? */
- if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
- /* left page */
- itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
- itemsz, btitem, afteritem);
- itup_blkno = BufferGetBlockNumber(buf);
- } else {
- /* right page */
- itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
- itemsz, btitem, afteritem);
- itup_blkno = BufferGetBlockNumber(rbuf);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ ItemId hitemid;
+ BTItem hitem;
+
+ Assert(!P_RIGHTMOST(lpageop));
+ hitemid = PageGetItemId(page, P_HIKEY);
+ hitem = (BTItem) PageGetItem(page, hitemid);
+ if (maxoff > P_HIKEY &&
+ !_bt_itemcmp(rel, keysz, hitem,
+ (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)),
+ BTEqualStrategyNumber))
+ elog(FATAL, "btree: bad key on the page in the chain of duplicates");
+
+ if (!_bt_skeycmp(rel, keysz, scankey, page, hitemid,
+ BTEqualStrategyNumber))
+ {
+ if (!P_LEFTMOST(lpageop))
+ elog(FATAL, "btree: attempt to insert bad key on the non-leftmost page in the chain of duplicates");
+ if (!_bt_skeycmp(rel, keysz, scankey, page, hitemid,
+ BTLessStrategyNumber))
+ elog(FATAL, "btree: attempt to insert higher key on the leftmost page in the chain of duplicates");
+ if (maxoff > P_HIKEY) /* have duplicate(s) */
+ {
+ firstright = P_FIRSTKEY;
+ do_split = true;
+ }
+ else
+/* "eat" page */
+ {
+ Buffer pbuf;
+ Page ppage;
+
+ itup_blkno = BufferGetBlockNumber(buf);
+ itup_off = PageAddItem(page, (Item) btitem, itemsz,
+ P_FIRSTKEY, LP_USED);
+ if (itup_off == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item");
+ lpageop->btpo_flags &= ~BTP_CHAIN;
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ ppage = BufferGetPage(pbuf);
+ PageIndexTupleDelete(ppage, stack->bts_offset);
+ pfree(stack->bts_btitem);
+ stack->bts_btitem = _bt_formitem(&(btitem->bti_itup));
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
+ itup_blkno, P_HIKEY);
+ _bt_wrtbuf(rel, buf);
+ res = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, scankey, stack->bts_btitem,
+ NULL);
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+ return (res);
+ }
+ }
+ else
+ {
+ keys_equal = true;
+ if (PageGetFreeSpace(page) < itemsz)
+ do_split = true;
+ }
}
-
- maxoff = PageGetMaxOffsetNumber (page);
- if ( shifted )
- {
- if ( maxoff > P_FIRSTKEY )
- elog (FATAL, "btree: shifted page is not empty");
- lowLeftItem = (BTItem) NULL;
- }
- else
- {
- if ( maxoff < P_FIRSTKEY )
- elog (FATAL, "btree: un-shifted page is empty");
- lowLeftItem = (BTItem) PageGetItem(page,
- PageGetItemId(page, P_FIRSTKEY));
- if ( _bt_itemcmp (rel, keysz, lowLeftItem,
- (BTItem) PageGetItem(page, PageGetItemId(page, P_HIKEY)),
- BTEqualStrategyNumber) )
- lpageop->btpo_flags |= BTP_CHAIN;
+ else if (PageGetFreeSpace(page) < itemsz)
+ do_split = true;
+ else if (PageGetFreeSpace(page) < 3 * itemsz + 2 * sizeof(ItemIdData))
+ {
+ OffsetNumber offnum = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY;
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+ ItemId itid;
+ BTItem previtem,
+ chkitem;
+ Size maxsize;
+ Size currsize;
+
+ itid = PageGetItemId(page, offnum);
+ previtem = (BTItem) PageGetItem(page, itid);
+ maxsize = currsize = (ItemIdGetLength(itid) + sizeof(ItemIdData));
+ for (offnum = OffsetNumberNext(offnum);
+ offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ itid = PageGetItemId(page, offnum);
+ chkitem = (BTItem) PageGetItem(page, itid);
+ if (!_bt_itemcmp(rel, keysz, previtem, chkitem,
+ BTEqualStrategyNumber))
+ {
+ if (currsize > maxsize)
+ maxsize = currsize;
+ currsize = 0;
+ previtem = chkitem;
+ }
+ currsize += (ItemIdGetLength(itid) + sizeof(ItemIdData));
+ }
+ if (currsize > maxsize)
+ maxsize = currsize;
+ maxsize += sizeof(PageHeaderData) +
+ DOUBLEALIGN(sizeof(BTPageOpaqueData));
+ if (maxsize >= PageGetPageSize(page) / 2)
+ do_split = true;
}
- /*
- * By here,
- *
- * + our target page has been split;
- * + the original tuple has been inserted;
- * + we have write locks on both the old (left half) and new
- * (right half) buffers, after the split; and
- * + we have the key we want to insert into the parent.
- *
- * Do the parent insertion. We need to hold onto the locks for
- * the child pages until we locate the parent, but we can release
- * them before doing the actual insertion (see Lehman and Yao for
- * the reasoning).
- */
-
- if (stack == (BTStack) NULL) {
-
- /* create a new root node and release the split buffers */
- _bt_newroot(rel, buf, rbuf);
- _bt_relbuf(rel, buf, BT_WRITE);
- _bt_relbuf(rel, rbuf, BT_WRITE);
-
- } else {
- ScanKey newskey;
- InsertIndexResult newres;
- BTItem new_item;
- OffsetNumber upditem_offset = P_HIKEY;
- bool do_update = false;
- bool update_in_place = true;
- bool parent_chained;
-
- /* form a index tuple that points at the new right page */
- rbknum = BufferGetBlockNumber(rbuf);
- rpage = BufferGetPage(rbuf);
- rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
-
- /*
- * By convention, the first entry (1) on every
- * non-rightmost page is the high key for that page. In
- * order to get the lowest key on the new right page, we
- * actually look at its second (2) entry.
- */
-
- if (! P_RIGHTMOST(rpageop))
- {
- ritem = (BTItem) PageGetItem(rpage,
- PageGetItemId(rpage, P_FIRSTKEY));
- if ( _bt_itemcmp (rel, keysz, ritem,
- (BTItem) PageGetItem(rpage,
- PageGetItemId(rpage, P_HIKEY)),
- BTEqualStrategyNumber) )
- rpageop->btpo_flags |= BTP_CHAIN;
- }
- else
- ritem = (BTItem) PageGetItem(rpage,
- PageGetItemId(rpage, P_HIKEY));
-
- /* get a unique btitem for this key */
- new_item = _bt_formitem(&(ritem->bti_itup));
-
- ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
-
- /*
- * Find the parent buffer and get the parent page.
- *
- * Oops - if we were moved right then we need to
- * change stack item! We want to find parent pointing to
- * where we are, right ? - vadim 05/27/97
- */
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
- bknum, P_HIKEY);
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
- ppage = BufferGetPage(pbuf);
- ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
- parent_chained = (( ppageop->btpo_flags & BTP_CHAIN )) ? true : false;
-
- if ( parent_chained && !left_chained )
- elog (FATAL, "nbtree: unexpected chained parent of unchained page");
-
- /*
- * If the key of new_item is < than the key of the item
- * in the parent page pointing to the left page
- * (stack->bts_btitem), we have to update the latter key;
- * otherwise the keys on the parent page wouldn't be
- * monotonically increasing after we inserted the new
- * pointer to the right page (new_item). This only
- * happens if our left page is the leftmost page and a
- * new minimum key had been inserted before, which is not
- * reflected in the parent page but didn't matter so
- * far. If there are duplicate keys and this new minimum
- * key spills over to our new right page, we get an
- * inconsistency if we don't update the left key in the
- * parent page.
- *
- * Also, new duplicates handling code require us to update
- * parent item if some smaller items left on the left page
- * (which is possible in splitting leftmost page) and
- * current parent item == new_item. - vadim 05/27/97
- */
- if ( _bt_itemcmp (rel, keysz, stack->bts_btitem, new_item,
- BTGreaterStrategyNumber) ||
- ( !shifted &&
- _bt_itemcmp(rel, keysz, stack->bts_btitem,
- new_item, BTEqualStrategyNumber) &&
- _bt_itemcmp(rel, keysz, lowLeftItem,
- new_item, BTLessStrategyNumber) ) )
- {
- do_update = true;
- /*
- * figure out which key is leftmost (if the parent page
- * is rightmost, too, it must be the root)
+ if (do_split)
+ {
+ Buffer rbuf;
+ Page rpage;
+ BTItem ritem;
+ BlockNumber rbknum;
+ BTPageOpaque rpageop;
+ Buffer pbuf;
+ Page ppage;
+ BTPageOpaque ppageop;
+ BlockNumber bknum = BufferGetBlockNumber(buf);
+ BTItem lowLeftItem;
+ OffsetNumber maxoff;
+ bool shifted = false;
+ bool left_chained = (lpageop->btpo_flags & BTP_CHAIN) ? true : false;
+
+ /*
+ * If we have to split leaf page in the chain of duplicates by new
+ * duplicate then we try to look at our right sibling first.
*/
- if(P_RIGHTMOST(ppageop))
- upditem_offset = P_HIKEY;
+ if ((lpageop->btpo_flags & BTP_CHAIN) &&
+ (lpageop->btpo_flags & BTP_LEAF) && keys_equal)
+ {
+ bool use_left = true;
+
+ rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+ if (!P_RIGHTMOST(rpageop)) /* non-rightmost page */
+ { /* If we have the same hikey here then
+ * it's yet another page in chain. */
+ if (_bt_skeycmp(rel, keysz, scankey, rpage,
+ PageGetItemId(rpage, P_HIKEY),
+ BTEqualStrategyNumber))
+ {
+ if (!(rpageop->btpo_flags & BTP_CHAIN))
+ elog(FATAL, "btree: lost page in the chain of duplicates");
+ }
+ else if (_bt_skeycmp(rel, keysz, scankey, rpage,
+ PageGetItemId(rpage, P_HIKEY),
+ BTGreaterStrategyNumber))
+ elog(FATAL, "btree: hikey is out of order");
+ else if (rpageop->btpo_flags & BTP_CHAIN)
+
+ /*
+ * If hikey > scankey then it's last page in chain and
+ * BTP_CHAIN must be OFF
+ */
+ elog(FATAL, "btree: lost last page in the chain of duplicates");
+
+ /* if there is room here then we use this page. */
+ if (PageGetFreeSpace(rpage) > itemsz)
+ use_left = false;
+ }
+ else
+/* rightmost page */
+ {
+ Assert(!(rpageop->btpo_flags & BTP_CHAIN));
+ /* if there is room here then we use this page. */
+ if (PageGetFreeSpace(rpage) > itemsz)
+ use_left = false;
+ }
+ if (!use_left) /* insert on the right page */
+ {
+ _bt_relbuf(rel, buf, BT_WRITE);
+ return (_bt_insertonpg(rel, rbuf, stack, keysz,
+ scankey, btitem, afteritem));
+ }
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+ }
+
+ /*
+ * If after splitting un-chained page we'll got chain of pages
+ * with duplicates then we want to know 1. on which of two pages
+ * new btitem will go (current _bt_findsplitloc is quite bad); 2.
+ * what parent (if there's one) thinking about it (remember about
+ * deletions)
+ */
+ else if (!(lpageop->btpo_flags & BTP_CHAIN))
+ {
+ OffsetNumber start = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY;
+ Size llimit;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ llimit = PageGetPageSize(page) - sizeof(PageHeaderData) -
+ DOUBLEALIGN(sizeof(BTPageOpaqueData))
+ + sizeof(ItemIdData);
+ llimit /= 2;
+ firstright = _bt_findsplitloc(rel, page, start, maxoff, llimit);
+
+ if (_bt_itemcmp(rel, keysz,
+ (BTItem) PageGetItem(page, PageGetItemId(page, start)),
+ (BTItem) PageGetItem(page, PageGetItemId(page, firstright)),
+ BTEqualStrategyNumber))
+ {
+ if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, firstright),
+ BTLessStrategyNumber))
+
+ /*
+ * force moving current items to the new page: new
+ * item will go on the current page.
+ */
+ firstright = start;
+ else
+
+ /*
+ * new btitem >= firstright, start item == firstright
+ * - new chain of duplicates: if this non-leftmost
+ * leaf page and parent item < start item then force
+ * moving all items to the new page - current page
+ * will be "empty" after it.
+ */
+ {
+ if (!P_LEFTMOST(lpageop) &&
+ (lpageop->btpo_flags & BTP_LEAF))
+ {
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
+ bknum, P_HIKEY);
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ if (_bt_itemcmp(rel, keysz, stack->bts_btitem,
+ (BTItem) PageGetItem(page,
+ PageGetItemId(page, start)),
+ BTLessStrategyNumber))
+ {
+ firstright = start;
+ shifted = true;
+ }
+ _bt_relbuf(rel, pbuf, BT_WRITE);
+ }
+ }
+ } /* else - no new chain if start item <
+ * firstright one */
+ }
+
+ /* split the buffer into left and right halves */
+ rbuf = _bt_split(rel, buf, firstright);
+
+ /* which new page (left half or right half) gets the tuple? */
+ if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem))
+ {
+ /* left page */
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+ }
else
- upditem_offset = P_FIRSTKEY;
- if ( !P_LEFTMOST(lpageop) ||
- stack->bts_offset != upditem_offset )
- elog (FATAL, "btree: items are out of order (leftmost %d, stack %u, update %u)",
- P_LEFTMOST(lpageop), stack->bts_offset, upditem_offset);
- }
-
- if ( do_update )
- {
- if ( shifted )
- elog (FATAL, "btree: attempt to update parent for shifted page");
- /*
- * Try to update in place. If out parent page is chained
- * then we must forse insertion.
+ {
+ /* right page */
+ itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(rbuf);
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ if (shifted)
+ {
+ if (maxoff > P_FIRSTKEY)
+ elog(FATAL, "btree: shifted page is not empty");
+ lowLeftItem = (BTItem) NULL;
+ }
+ else
+ {
+ if (maxoff < P_FIRSTKEY)
+ elog(FATAL, "btree: un-shifted page is empty");
+ lowLeftItem = (BTItem) PageGetItem(page,
+ PageGetItemId(page, P_FIRSTKEY));
+ if (_bt_itemcmp(rel, keysz, lowLeftItem,
+ (BTItem) PageGetItem(page, PageGetItemId(page, P_HIKEY)),
+ BTEqualStrategyNumber))
+ lpageop->btpo_flags |= BTP_CHAIN;
+ }
+
+ /*
+ * By here,
+ *
+ * + our target page has been split; + the original tuple has been
+ * inserted; + we have write locks on both the old (left half)
+ * and new (right half) buffers, after the split; and + we have
+ * the key we want to insert into the parent.
+ *
+ * Do the parent insertion. We need to hold onto the locks for the
+ * child pages until we locate the parent, but we can release them
+ * before doing the actual insertion (see Lehman and Yao for the
+ * reasoning).
*/
- if ( !parent_chained &&
- DOUBLEALIGN (IndexTupleDSize (lowLeftItem->bti_itup)) ==
- DOUBLEALIGN (IndexTupleDSize (stack->bts_btitem->bti_itup)) )
- {
- _bt_updateitem(rel, keysz, pbuf,
- stack->bts_btitem, lowLeftItem);
- _bt_relbuf(rel, buf, BT_WRITE);
- _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ if (stack == (BTStack) NULL)
+ {
+
+ /* create a new root node and release the split buffers */
+ _bt_newroot(rel, buf, rbuf);
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
}
else
{
- update_in_place = false;
- PageIndexTupleDelete(ppage, upditem_offset);
-
- /*
- * don't write anything out yet--we still have the write
- * lock, and now we call another _bt_insertonpg to
- * insert the correct key.
- * First, make a new item, using the tuple data from
- * lowLeftItem. Point it to the left child.
- * Update it on the stack at the same time.
- */
- pfree(stack->bts_btitem);
- stack->bts_btitem = _bt_formitem(&(lowLeftItem->bti_itup));
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
- bknum, P_HIKEY);
-
- /*
- * Unlock the children before doing this
- *
- * Mmm ... I foresee problems here. - vadim 06/10/97
- */
- _bt_relbuf(rel, buf, BT_WRITE);
- _bt_relbuf(rel, rbuf, BT_WRITE);
-
- /*
- * A regular _bt_binsrch should find the right place to
- * put the new entry, since it should be lower than any
- * other key on the page.
- * Therefore set afteritem to NULL.
- */
- newskey = _bt_mkscankey(rel, &(stack->bts_btitem->bti_itup));
- newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
- keysz, newskey, stack->bts_btitem,
- NULL);
-
- pfree(newres);
- pfree(newskey);
-
- /*
- * we have now lost our lock on the parent buffer, and
- * need to get it back.
- */
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ ScanKey newskey;
+ InsertIndexResult newres;
+ BTItem new_item;
+ OffsetNumber upditem_offset = P_HIKEY;
+ bool do_update = false;
+ bool update_in_place = true;
+ bool parent_chained;
+
+ /* form a index tuple that points at the new right page */
+ rbknum = BufferGetBlockNumber(rbuf);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /*
+ * By convention, the first entry (1) on every non-rightmost
+ * page is the high key for that page. In order to get the
+ * lowest key on the new right page, we actually look at its
+ * second (2) entry.
+ */
+
+ if (!P_RIGHTMOST(rpageop))
+ {
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_FIRSTKEY));
+ if (_bt_itemcmp(rel, keysz, ritem,
+ (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_HIKEY)),
+ BTEqualStrategyNumber))
+ rpageop->btpo_flags |= BTP_CHAIN;
+ }
+ else
+ ritem = (BTItem) PageGetItem(rpage,
+ PageGetItemId(rpage, P_HIKEY));
+
+ /* get a unique btitem for this key */
+ new_item = _bt_formitem(&(ritem->bti_itup));
+
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
+
+ /*
+ * Find the parent buffer and get the parent page.
+ *
+ * Oops - if we were moved right then we need to change stack
+ * item! We want to find parent pointing to where we are,
+ * right ? - vadim 05/27/97
+ */
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
+ bknum, P_HIKEY);
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ ppage = BufferGetPage(pbuf);
+ ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
+ parent_chained = ((ppageop->btpo_flags & BTP_CHAIN)) ? true : false;
+
+ if (parent_chained && !left_chained)
+ elog(FATAL, "nbtree: unexpected chained parent of unchained page");
+
+ /*
+ * If the key of new_item is < than the key of the item in the
+ * parent page pointing to the left page (stack->bts_btitem),
+ * we have to update the latter key; otherwise the keys on the
+ * parent page wouldn't be monotonically increasing after we
+ * inserted the new pointer to the right page (new_item). This
+ * only happens if our left page is the leftmost page and a
+ * new minimum key had been inserted before, which is not
+ * reflected in the parent page but didn't matter so far. If
+ * there are duplicate keys and this new minimum key spills
+ * over to our new right page, we get an inconsistency if we
+ * don't update the left key in the parent page.
+ *
+ * Also, new duplicates handling code require us to update parent
+ * item if some smaller items left on the left page (which is
+ * possible in splitting leftmost page) and current parent
+ * item == new_item. - vadim 05/27/97
+ */
+ if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
+ BTGreaterStrategyNumber) ||
+ (!shifted &&
+ _bt_itemcmp(rel, keysz, stack->bts_btitem,
+ new_item, BTEqualStrategyNumber) &&
+ _bt_itemcmp(rel, keysz, lowLeftItem,
+ new_item, BTLessStrategyNumber)))
+ {
+ do_update = true;
+
+ /*
+ * figure out which key is leftmost (if the parent page is
+ * rightmost, too, it must be the root)
+ */
+ if (P_RIGHTMOST(ppageop))
+ upditem_offset = P_HIKEY;
+ else
+ upditem_offset = P_FIRSTKEY;
+ if (!P_LEFTMOST(lpageop) ||
+ stack->bts_offset != upditem_offset)
+ elog(FATAL, "btree: items are out of order (leftmost %d, stack %u, update %u)",
+ P_LEFTMOST(lpageop), stack->bts_offset, upditem_offset);
+ }
+
+ if (do_update)
+ {
+ if (shifted)
+ elog(FATAL, "btree: attempt to update parent for shifted page");
+
+ /*
+ * Try to update in place. If out parent page is chained
+ * then we must forse insertion.
+ */
+ if (!parent_chained &&
+ DOUBLEALIGN(IndexTupleDSize(lowLeftItem->bti_itup)) ==
+ DOUBLEALIGN(IndexTupleDSize(stack->bts_btitem->bti_itup)))
+ {
+ _bt_updateitem(rel, keysz, pbuf,
+ stack->bts_btitem, lowLeftItem);
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+ }
+ else
+ {
+ update_in_place = false;
+ PageIndexTupleDelete(ppage, upditem_offset);
+
+ /*
+ * don't write anything out yet--we still have the
+ * write lock, and now we call another _bt_insertonpg
+ * to insert the correct key. First, make a new item,
+ * using the tuple data from lowLeftItem. Point it to
+ * the left child. Update it on the stack at the same
+ * time.
+ */
+ pfree(stack->bts_btitem);
+ stack->bts_btitem = _bt_formitem(&(lowLeftItem->bti_itup));
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
+ bknum, P_HIKEY);
+
+ /*
+ * Unlock the children before doing this
+ *
+ * Mmm ... I foresee problems here. - vadim 06/10/97
+ */
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+
+ /*
+ * A regular _bt_binsrch should find the right place
+ * to put the new entry, since it should be lower than
+ * any other key on the page. Therefore set afteritem
+ * to NULL.
+ */
+ newskey = _bt_mkscankey(rel, &(stack->bts_btitem->bti_itup));
+ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, newskey, stack->bts_btitem,
+ NULL);
+
+ pfree(newres);
+ pfree(newskey);
+
+ /*
+ * we have now lost our lock on the parent buffer, and
+ * need to get it back.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ }
+ }
+ else
+ {
+ _bt_relbuf(rel, buf, BT_WRITE);
+ _bt_relbuf(rel, rbuf, BT_WRITE);
+ }
+
+ newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
+
+ afteritem = stack->bts_btitem;
+ if (parent_chained && !update_in_place)
+ {
+ ppage = BufferGetPage(pbuf);
+ ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
+ if (ppageop->btpo_flags & BTP_CHAIN)
+ elog(FATAL, "btree: unexpected BTP_CHAIN flag in parent after update");
+ if (P_RIGHTMOST(ppageop))
+ elog(FATAL, "btree: chained parent is RIGHTMOST after update");
+ maxoff = PageGetMaxOffsetNumber(ppage);
+ if (maxoff != P_FIRSTKEY)
+ elog(FATAL, "btree: FIRSTKEY was unexpected in parent after update");
+ if (_bt_skeycmp(rel, keysz, newskey, ppage,
+ PageGetItemId(ppage, P_FIRSTKEY),
+ BTLessEqualStrategyNumber))
+ elog(FATAL, "btree: parent FIRSTKEY is >= duplicate key after update");
+ if (!_bt_skeycmp(rel, keysz, newskey, ppage,
+ PageGetItemId(ppage, P_HIKEY),
+ BTEqualStrategyNumber))
+ elog(FATAL, "btree: parent HIGHKEY is not equal duplicate key after update");
+ afteritem = (BTItem) NULL;
+ }
+ else if (left_chained && !update_in_place)
+ {
+ ppage = BufferGetPage(pbuf);
+ ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
+ if (!P_RIGHTMOST(ppageop) &&
+ _bt_skeycmp(rel, keysz, newskey, ppage,
+ PageGetItemId(ppage, P_HIKEY),
+ BTGreaterStrategyNumber))
+ afteritem = (BTItem) NULL;
+ }
+ if (afteritem == (BTItem) NULL)
+ {
+ rbuf = _bt_getbuf(rel, ppageop->btpo_next, BT_WRITE);
+ _bt_relbuf(rel, pbuf, BT_WRITE);
+ pbuf = rbuf;
+ }
+
+ newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, newskey, new_item,
+ afteritem);
+
+ /* be tidy */
+ pfree(newres);
+ pfree(newskey);
+ pfree(new_item);
}
- }
- else
- {
- _bt_relbuf(rel, buf, BT_WRITE);
- _bt_relbuf(rel, rbuf, BT_WRITE);
- }
-
- newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
-
- afteritem = stack->bts_btitem;
- if ( parent_chained && !update_in_place )
- {
- ppage = BufferGetPage(pbuf);
- ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
- if ( ppageop->btpo_flags & BTP_CHAIN )
- elog (FATAL, "btree: unexpected BTP_CHAIN flag in parent after update");
- if ( P_RIGHTMOST (ppageop) )
- elog (FATAL, "btree: chained parent is RIGHTMOST after update");
- maxoff = PageGetMaxOffsetNumber (ppage);
- if ( maxoff != P_FIRSTKEY )
- elog (FATAL, "btree: FIRSTKEY was unexpected in parent after update");
- if ( _bt_skeycmp (rel, keysz, newskey, ppage,
- PageGetItemId(ppage, P_FIRSTKEY),
- BTLessEqualStrategyNumber) )
- elog (FATAL, "btree: parent FIRSTKEY is >= duplicate key after update");
- if ( !_bt_skeycmp (rel, keysz, newskey, ppage,
- PageGetItemId(ppage, P_HIKEY),
- BTEqualStrategyNumber) )
- elog (FATAL, "btree: parent HIGHKEY is not equal duplicate key after update");
- afteritem = (BTItem) NULL;
- }
- else if ( left_chained && !update_in_place )
- {
- ppage = BufferGetPage(pbuf);
- ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
- if ( !P_RIGHTMOST (ppageop) &&
- _bt_skeycmp (rel, keysz, newskey, ppage,
- PageGetItemId(ppage, P_HIKEY),
- BTGreaterStrategyNumber) )
- afteritem = (BTItem) NULL;
- }
- if ( afteritem == (BTItem) NULL)
- {
- rbuf = _bt_getbuf(rel, ppageop->btpo_next, BT_WRITE);
- _bt_relbuf(rel, pbuf, BT_WRITE);
- pbuf = rbuf;
- }
-
- newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
- keysz, newskey, new_item,
- afteritem);
-
- /* be tidy */
- pfree(newres);
- pfree(newskey);
- pfree(new_item);
}
- } else {
- itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
- itemsz, btitem, afteritem);
- itup_blkno = BufferGetBlockNumber(buf);
-
- _bt_relbuf(rel, buf, BT_WRITE);
- }
-
- /* by here, the new tuple is inserted */
- res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
- ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
-
- return (res);
+ else
+ {
+ itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
+ itemsz, btitem, afteritem);
+ itup_blkno = BufferGetBlockNumber(buf);
+
+ _bt_relbuf(rel, buf, BT_WRITE);
+ }
+
+ /* by here, the new tuple is inserted */
+ res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+ ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+
+ return (res);
}
/*
- * _bt_split() -- split a page in the btree.
+ * _bt_split() -- split a page in the btree.
*
- * On entry, buf is the page to split, and is write-locked and pinned.
- * Returns the new right sibling of buf, pinned and write-locked. The
- * pin and lock on buf are maintained.
+ * On entry, buf is the page to split, and is write-locked and pinned.
+ * Returns the new right sibling of buf, pinned and write-locked. The
+ * pin and lock on buf are maintained.
*/
-static Buffer
+static Buffer
_bt_split(Relation rel, Buffer buf, OffsetNumber firstright)
{
- Buffer rbuf;
- Page origpage;
- Page leftpage, rightpage;
- BTPageOpaque ropaque, lopaque, oopaque;
- Buffer sbuf;
- Page spage;
- BTPageOpaque sopaque;
- Size itemsz;
- ItemId itemid;
- BTItem item;
- OffsetNumber leftoff, rightoff;
- OffsetNumber start;
- OffsetNumber maxoff;
- OffsetNumber i;
-
- rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
- origpage = BufferGetPage(buf);
- leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
- rightpage = BufferGetPage(rbuf);
-
- _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
- _bt_pageinit(leftpage, BufferGetPageSize(buf));
-
- /* init btree private data */
- oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
- lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
- ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
-
- /* if we're splitting this page, it won't be the root when we're done */
- oopaque->btpo_flags &= ~BTP_ROOT;
- oopaque->btpo_flags &= ~BTP_CHAIN;
- lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
- lopaque->btpo_prev = oopaque->btpo_prev;
- ropaque->btpo_prev = BufferGetBlockNumber(buf);
- lopaque->btpo_next = BufferGetBlockNumber(rbuf);
- ropaque->btpo_next = oopaque->btpo_next;
-
- /*
- * If the page we're splitting is not the rightmost page at its
- * level in the tree, then the first (0) entry on the page is the
- * high key for the page. We need to copy that to the right
- * half. Otherwise (meaning the rightmost page case), we should
- * treat the line pointers beginning at zero as user data.
- *
- * We leave a blank space at the start of the line table for the
- * left page. We'll come back later and fill it in with the high
- * key item we get from the right key.
- */
-
- leftoff = P_FIRSTKEY;
- ropaque->btpo_next = oopaque->btpo_next;
- if (! P_RIGHTMOST(oopaque)) {
- /* splitting a non-rightmost page, start at the first data item */
- start = P_FIRSTKEY;
-
- itemid = PageGetItemId(origpage, P_HIKEY);
- itemsz = ItemIdGetLength(itemid);
- item = (BTItem) PageGetItem(origpage, itemid);
- if ( PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add hikey to the right sibling");
- rightoff = P_FIRSTKEY;
- } else {
- /* splitting a rightmost page, "high key" is the first data item */
- start = P_HIKEY;
-
- /* the new rightmost page will not have a high key */
- rightoff = P_HIKEY;
- }
- maxoff = PageGetMaxOffsetNumber(origpage);
- if ( firstright == InvalidOffsetNumber )
- {
- Size llimit = PageGetFreeSpace(leftpage) / 2;
- firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);
- }
-
- for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
- itemid = PageGetItemId(origpage, i);
+ Buffer rbuf;
+ Page origpage;
+ Page leftpage,
+ rightpage;
+ BTPageOpaque ropaque,
+ lopaque,
+ oopaque;
+ Buffer sbuf;
+ Page spage;
+ BTPageOpaque sopaque;
+ Size itemsz;
+ ItemId itemid;
+ BTItem item;
+ OffsetNumber leftoff,
+ rightoff;
+ OffsetNumber start;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+
+ rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ origpage = BufferGetPage(buf);
+ leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
+ rightpage = BufferGetPage(rbuf);
+
+ _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
+ _bt_pageinit(leftpage, BufferGetPageSize(buf));
+
+ /* init btree private data */
+ oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
+
+ /* if we're splitting this page, it won't be the root when we're done */
+ oopaque->btpo_flags &= ~BTP_ROOT;
+ oopaque->btpo_flags &= ~BTP_CHAIN;
+ lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
+ lopaque->btpo_prev = oopaque->btpo_prev;
+ ropaque->btpo_prev = BufferGetBlockNumber(buf);
+ lopaque->btpo_next = BufferGetBlockNumber(rbuf);
+ ropaque->btpo_next = oopaque->btpo_next;
+
+ /*
+ * If the page we're splitting is not the rightmost page at its level
+ * in the tree, then the first (0) entry on the page is the high key
+ * for the page. We need to copy that to the right half. Otherwise
+ * (meaning the rightmost page case), we should treat the line
+ * pointers beginning at zero as user data.
+ *
+ * We leave a blank space at the start of the line table for the left
+ * page. We'll come back later and fill it in with the high key item
+ * we get from the right key.
+ */
+
+ leftoff = P_FIRSTKEY;
+ ropaque->btpo_next = oopaque->btpo_next;
+ if (!P_RIGHTMOST(oopaque))
+ {
+ /* splitting a non-rightmost page, start at the first data item */
+ start = P_FIRSTKEY;
+
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+ if (PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add hikey to the right sibling");
+ rightoff = P_FIRSTKEY;
+ }
+ else
+ {
+ /* splitting a rightmost page, "high key" is the first data item */
+ start = P_HIKEY;
+
+ /* the new rightmost page will not have a high key */
+ rightoff = P_HIKEY;
+ }
+ maxoff = PageGetMaxOffsetNumber(origpage);
+ if (firstright == InvalidOffsetNumber)
+ {
+ Size llimit = PageGetFreeSpace(leftpage) / 2;
+
+ firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);
+ }
+
+ for (i = start; i <= maxoff; i = OffsetNumberNext(i))
+ {
+ itemid = PageGetItemId(origpage, i);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(origpage, itemid);
+
+ /* decide which page to put it on */
+ if (i < firstright)
+ {
+ if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
+ LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the left sibling");
+ leftoff = OffsetNumberNext(leftoff);
+ }
+ else
+ {
+ if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
+ LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the right sibling");
+ rightoff = OffsetNumberNext(rightoff);
+ }
+ }
+
+ /*
+ * Okay, page has been split, high key on right page is correct. Now
+ * set the high key on the left page to be the min key on the right
+ * page.
+ */
+
+ if (P_RIGHTMOST(ropaque))
+ {
+ itemid = PageGetItemId(rightpage, P_HIKEY);
+ }
+ else
+ {
+ itemid = PageGetItemId(rightpage, P_FIRSTKEY);
+ }
itemsz = ItemIdGetLength(itemid);
- item = (BTItem) PageGetItem(origpage, itemid);
-
- /* decide which page to put it on */
- if (i < firstright) {
- if ( PageAddItem(leftpage, (Item) item, itemsz, leftoff,
- LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item to the left sibling");
- leftoff = OffsetNumberNext(leftoff);
- } else {
- if ( PageAddItem(rightpage, (Item) item, itemsz, rightoff,
- LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item to the right sibling");
- rightoff = OffsetNumberNext(rightoff);
+ item = (BTItem) PageGetItem(rightpage, itemid);
+
+ /*
+ * We left a hole for the high key on the left page; fill it. The
+ * modal crap is to tell the page manager to put the new item on the
+ * page and not screw around with anything else. Whoever designed
+ * this interface has presumably crawled back into the dung heap they
+ * came from. No one here will admit to it.
+ */
+
+ PageManagerModeSet(OverwritePageManagerMode);
+ if (PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add hikey to the left sibling");
+ PageManagerModeSet(ShufflePageManagerMode);
+
+ /*
+ * By here, the original data page has been split into two new halves,
+ * and these are correct. The algorithm requires that the left page
+ * never move during a split, so we copy the new left page back on top
+ * of the original. Note that this is not a waste of time, since we
+ * also require (in the page management code) that the center of a
+ * page always be clean, and the most efficient way to guarantee this
+ * is just to compact the data by reinserting it into a new left page.
+ */
+
+ PageRestoreTempPage(leftpage, origpage);
+
+ /* write these guys out */
+ _bt_wrtnorelbuf(rel, rbuf);
+ _bt_wrtnorelbuf(rel, buf);
+
+ /*
+ * Finally, we need to grab the right sibling (if any) and fix the
+ * prev pointer there. We are guaranteed that this is deadlock-free
+ * since no other writer will be moving holding a lock on that page
+ * and trying to move left, and all readers release locks on a page
+ * before trying to fetch its neighbors.
+ */
+
+ if (!P_RIGHTMOST(ropaque))
+ {
+ sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
+ spage = BufferGetPage(sbuf);
+ sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
+ sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
+
+ /* write and release the old right sibling */
+ _bt_wrtbuf(rel, sbuf);
}
- }
-
- /*
- * Okay, page has been split, high key on right page is correct. Now
- * set the high key on the left page to be the min key on the right
- * page.
- */
-
- if (P_RIGHTMOST(ropaque)) {
- itemid = PageGetItemId(rightpage, P_HIKEY);
- } else {
- itemid = PageGetItemId(rightpage, P_FIRSTKEY);
- }
- itemsz = ItemIdGetLength(itemid);
- item = (BTItem) PageGetItem(rightpage, itemid);
-
- /*
- * We left a hole for the high key on the left page; fill it. The
- * modal crap is to tell the page manager to put the new item on the
- * page and not screw around with anything else. Whoever designed
- * this interface has presumably crawled back into the dung heap they
- * came from. No one here will admit to it.
- */
-
- PageManagerModeSet(OverwritePageManagerMode);
- if ( PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add hikey to the left sibling");
- PageManagerModeSet(ShufflePageManagerMode);
-
- /*
- * By here, the original data page has been split into two new halves,
- * and these are correct. The algorithm requires that the left page
- * never move during a split, so we copy the new left page back on top
- * of the original. Note that this is not a waste of time, since we
- * also require (in the page management code) that the center of a
- * page always be clean, and the most efficient way to guarantee this
- * is just to compact the data by reinserting it into a new left page.
- */
-
- PageRestoreTempPage(leftpage, origpage);
-
- /* write these guys out */
- _bt_wrtnorelbuf(rel, rbuf);
- _bt_wrtnorelbuf(rel, buf);
-
- /*
- * Finally, we need to grab the right sibling (if any) and fix the
- * prev pointer there. We are guaranteed that this is deadlock-free
- * since no other writer will be moving holding a lock on that page
- * and trying to move left, and all readers release locks on a page
- * before trying to fetch its neighbors.
- */
-
- if (! P_RIGHTMOST(ropaque)) {
- sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
- spage = BufferGetPage(sbuf);
- sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
- sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
-
- /* write and release the old right sibling */
- _bt_wrtbuf(rel, sbuf);
- }
-
- /* split's done */
- return (rbuf);
+
+ /* split's done */
+ return (rbuf);
}
/*
- * _bt_findsplitloc() -- find a safe place to split a page.
+ * _bt_findsplitloc() -- find a safe place to split a page.
*
- * In order to guarantee the proper handling of searches for duplicate
- * keys, the first duplicate in the chain must either be the first
- * item on the page after the split, or the entire chain must be on
- * one of the two pages. That is,
- * [1 2 2 2 3 4 5]
- * must become
- * [1] [2 2 2 3 4 5]
- * or
- * [1 2 2 2] [3 4 5]
- * but not
- * [1 2 2] [2 3 4 5].
- * However,
- * [2 2 2 2 2 3 4]
- * may be split as
- * [2 2 2 2] [2 3 4].
+ * In order to guarantee the proper handling of searches for duplicate
+ * keys, the first duplicate in the chain must either be the first
+ * item on the page after the split, or the entire chain must be on
+ * one of the two pages. That is,
+ * [1 2 2 2 3 4 5]
+ * must become
+ * [1] [2 2 2 3 4 5]
+ * or
+ * [1 2 2 2] [3 4 5]
+ * but not
+ * [1 2 2] [2 3 4 5].
+ * However,
+ * [2 2 2 2 2 3 4]
+ * may be split as
+ * [2 2 2 2] [2 3 4].
*/
-static OffsetNumber
+static OffsetNumber
_bt_findsplitloc(Relation rel,
- Page page,
- OffsetNumber start,
- OffsetNumber maxoff,
- Size llimit)
+ Page page,
+ OffsetNumber start,
+ OffsetNumber maxoff,
+ Size llimit)
{
- OffsetNumber i;
- OffsetNumber saferight;
- ItemId nxtitemid, safeitemid;
- BTItem safeitem, nxtitem;
- Size nbytes;
- int natts;
-
- if ( start >= maxoff )
- elog (FATAL, "btree: cannot split if start (%d) >= maxoff (%d)",
- start, maxoff);
- natts = rel->rd_rel->relnatts;
- saferight = start;
- safeitemid = PageGetItemId(page, saferight);
- nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
- safeitem = (BTItem) PageGetItem(page, safeitemid);
-
- i = OffsetNumberNext(start);
-
- while (nbytes < llimit)
- {
- /* check the next item on the page */
- nxtitemid = PageGetItemId(page, i);
- nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
- nxtitem = (BTItem) PageGetItem(page, nxtitemid);
-
- /*
- * Test against last known safe item:
- * if the tuple we're looking at isn't equal to the last safe
- * one we saw, then it's our new safe tuple.
- */
- if ( !_bt_itemcmp (rel, natts,
- safeitem, nxtitem, BTEqualStrategyNumber) )
+ OffsetNumber i;
+ OffsetNumber saferight;
+ ItemId nxtitemid,
+ safeitemid;
+ BTItem safeitem,
+ nxtitem;
+ Size nbytes;
+ int natts;
+
+ if (start >= maxoff)
+ elog(FATAL, "btree: cannot split if start (%d) >= maxoff (%d)",
+ start, maxoff);
+ natts = rel->rd_rel->relnatts;
+ saferight = start;
+ safeitemid = PageGetItemId(page, saferight);
+ nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
+ safeitem = (BTItem) PageGetItem(page, safeitemid);
+
+ i = OffsetNumberNext(start);
+
+ while (nbytes < llimit)
{
- safeitem = nxtitem;
- saferight = i;
+ /* check the next item on the page */
+ nxtitemid = PageGetItemId(page, i);
+ nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
+ nxtitem = (BTItem) PageGetItem(page, nxtitemid);
+
+ /*
+ * Test against last known safe item: if the tuple we're looking
+ * at isn't equal to the last safe one we saw, then it's our new
+ * safe tuple.
+ */
+ if (!_bt_itemcmp(rel, natts,
+ safeitem, nxtitem, BTEqualStrategyNumber))
+ {
+ safeitem = nxtitem;
+ saferight = i;
+ }
+ if (i < maxoff)
+ i = OffsetNumberNext(i);
+ else
+ break;
}
- if ( i < maxoff )
- i = OffsetNumberNext(i);
- else
- break;
- }
-
- /*
- * If the chain of dups starts at the beginning of the page and extends
- * past the halfway mark, we can split it in the middle.
- */
-
- if (saferight == start)
- saferight = i;
-
- if ( saferight == maxoff && ( maxoff - start ) > 1 )
- saferight = start + ( maxoff - start ) / 2;
-
- return (saferight);
+
+ /*
+ * If the chain of dups starts at the beginning of the page and
+ * extends past the halfway mark, we can split it in the middle.
+ */
+
+ if (saferight == start)
+ saferight = i;
+
+ if (saferight == maxoff && (maxoff - start) > 1)
+ saferight = start + (maxoff - start) / 2;
+
+ return (saferight);
}
/*
- * _bt_newroot() -- Create a new root page for the index.
+ * _bt_newroot() -- Create a new root page for the index.
*
- * We've just split the old root page and need to create a new one.
- * In order to do this, we add a new root page to the file, then lock
- * the metadata page and update it. This is guaranteed to be deadlock-
- * free, because all readers release their locks on the metadata page
- * before trying to lock the root, and all writers lock the root before
- * trying to lock the metadata page. We have a write lock on the old
- * root page, so we have not introduced any cycles into the waits-for
- * graph.
+ * We've just split the old root page and need to create a new one.
+ * In order to do this, we add a new root page to the file, then lock
+ * the metadata page and update it. This is guaranteed to be deadlock-
+ * free, because all readers release their locks on the metadata page
+ * before trying to lock the root, and all writers lock the root before
+ * trying to lock the metadata page. We have a write lock on the old
+ * root page, so we have not introduced any cycles into the waits-for
+ * graph.
*
- * On entry, lbuf (the old root) and rbuf (its new peer) are write-
- * locked. We don't drop the locks in this routine; that's done by
- * the caller. On exit, a new root page exists with entries for the
- * two new children. The new root page is neither pinned nor locked.
+ * On entry, lbuf (the old root) and rbuf (its new peer) are write-
+ * locked. We don't drop the locks in this routine; that's done by
+ * the caller. On exit, a new root page exists with entries for the
+ * two new children. The new root page is neither pinned nor locked.
*/
static void
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
- Buffer rootbuf;
- Page lpage, rpage, rootpage;
- BlockNumber lbkno, rbkno;
- BlockNumber rootbknum;
- BTPageOpaque rootopaque;
- ItemId itemid;
- BTItem item;
- Size itemsz;
- BTItem new_item;
-
- /* get a new root page */
- rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
- rootpage = BufferGetPage(rootbuf);
- _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
-
- /* set btree special data */
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
- rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
- rootopaque->btpo_flags |= BTP_ROOT;
-
- /*
- * Insert the internal tuple pointers.
- */
-
- lbkno = BufferGetBlockNumber(lbuf);
- rbkno = BufferGetBlockNumber(rbuf);
- lpage = BufferGetPage(lbuf);
- rpage = BufferGetPage(rbuf);
-
- /*
- * step over the high key on the left page while building the
- * left page pointer.
- */
- itemid = PageGetItemId(lpage, P_FIRSTKEY);
- itemsz = ItemIdGetLength(itemid);
- item = (BTItem) PageGetItem(lpage, itemid);
- new_item = _bt_formitem(&(item->bti_itup));
- ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
-
- /*
- * insert the left page pointer into the new root page. the root
- * page is the rightmost page on its level so the "high key" item
- * is the first data item.
- */
- if ( PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add leftkey to new root page");
- pfree(new_item);
-
- /*
- * the right page is the rightmost page on the second level, so
- * the "high key" item is the first data item on that page as well.
- */
- itemid = PageGetItemId(rpage, P_HIKEY);
- itemsz = ItemIdGetLength(itemid);
- item = (BTItem) PageGetItem(rpage, itemid);
- new_item = _bt_formitem(&(item->bti_itup));
- ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
-
- /*
- * insert the right page pointer into the new root page.
- */
- if ( PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add rightkey to new root page");
- pfree(new_item);
-
- /* write and let go of the root buffer */
- rootbknum = BufferGetBlockNumber(rootbuf);
- _bt_wrtbuf(rel, rootbuf);
-
- /* update metadata page with new root block number */
- _bt_metaproot(rel, rootbknum, 0);
+ Buffer rootbuf;
+ Page lpage,
+ rpage,
+ rootpage;
+ BlockNumber lbkno,
+ rbkno;
+ BlockNumber rootbknum;
+ BTPageOpaque rootopaque;
+ ItemId itemid;
+ BTItem item;
+ Size itemsz;
+ BTItem new_item;
+
+ /* get a new root page */
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootpage = BufferGetPage(rootbuf);
+ _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
+
+ /* set btree special data */
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
+ rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
+ rootopaque->btpo_flags |= BTP_ROOT;
+
+ /*
+ * Insert the internal tuple pointers.
+ */
+
+ lbkno = BufferGetBlockNumber(lbuf);
+ rbkno = BufferGetBlockNumber(rbuf);
+ lpage = BufferGetPage(lbuf);
+ rpage = BufferGetPage(rbuf);
+
+ /*
+ * step over the high key on the left page while building the left
+ * page pointer.
+ */
+ itemid = PageGetItemId(lpage, P_FIRSTKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(lpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
+
+ /*
+ * insert the left page pointer into the new root page. the root page
+ * is the rightmost page on its level so the "high key" item is the
+ * first data item.
+ */
+ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add leftkey to new root page");
+ pfree(new_item);
+
+ /*
+ * the right page is the rightmost page on the second level, so the
+ * "high key" item is the first data item on that page as well.
+ */
+ itemid = PageGetItemId(rpage, P_HIKEY);
+ itemsz = ItemIdGetLength(itemid);
+ item = (BTItem) PageGetItem(rpage, itemid);
+ new_item = _bt_formitem(&(item->bti_itup));
+ ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
+
+ /*
+ * insert the right page pointer into the new root page.
+ */
+ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add rightkey to new root page");
+ pfree(new_item);
+
+ /* write and let go of the root buffer */
+ rootbknum = BufferGetBlockNumber(rootbuf);
+ _bt_wrtbuf(rel, rootbuf);
+
+ /* update metadata page with new root block number */
+ _bt_metaproot(rel, rootbknum, 0);
}
/*
- * _bt_pgaddtup() -- add a tuple to a particular page in the index.
+ * _bt_pgaddtup() -- add a tuple to a particular page in the index.
*
- * This routine adds the tuple to the page as requested, and keeps the
- * write lock and reference associated with the page's buffer. It is
- * an error to call pgaddtup() without a write lock and reference. If
- * afteritem is non-null, it's the item that we expect our new item
- * to follow. Otherwise, we do a binary search for the correct place
- * and insert the new item there.
+ * This routine adds the tuple to the page as requested, and keeps the
+ * write lock and reference associated with the page's buffer. It is
+ * an error to call pgaddtup() without a write lock and reference. If
+ * afteritem is non-null, it's the item that we expect our new item
+ * to follow. Otherwise, we do a binary search for the correct place
+ * and insert the new item there.
*/
-static OffsetNumber
+static OffsetNumber
_bt_pgaddtup(Relation rel,
- Buffer buf,
- int keysz,
- ScanKey itup_scankey,
- Size itemsize,
- BTItem btitem,
- BTItem afteritem)
+ Buffer buf,
+ int keysz,
+ ScanKey itup_scankey,
+ Size itemsize,
+ BTItem btitem,
+ BTItem afteritem)
{
- OffsetNumber itup_off;
- OffsetNumber first;
- Page page;
- BTPageOpaque opaque;
- BTItem chkitem;
-
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- if (afteritem == (BTItem) NULL) {
- itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
- } else {
- itup_off = first;
-
- do {
- chkitem =
- (BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
- itup_off = OffsetNumberNext(itup_off);
- } while ( ! BTItemSame (chkitem, afteritem) );
- }
-
- if ( PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item to the page");
-
- /* write the buffer, but hold our lock */
- _bt_wrtnorelbuf(rel, buf);
-
- return (itup_off);
+ OffsetNumber itup_off;
+ OffsetNumber first;
+ Page page;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (afteritem == (BTItem) NULL)
+ {
+ itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
+ }
+ else
+ {
+ itup_off = first;
+
+ do
+ {
+ chkitem =
+ (BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
+ itup_off = OffsetNumberNext(itup_off);
+ } while (!BTItemSame(chkitem, afteritem));
+ }
+
+ if (PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the page");
+
+ /* write the buffer, but hold our lock */
+ _bt_wrtnorelbuf(rel, buf);
+
+ return (itup_off);
}
/*
- * _bt_goesonpg() -- Does a new tuple belong on this page?
+ * _bt_goesonpg() -- Does a new tuple belong on this page?
*
- * This is part of the complexity introduced by allowing duplicate
- * keys into the index. The tuple belongs on this page if:
+ * This is part of the complexity introduced by allowing duplicate
+ * keys into the index. The tuple belongs on this page if:
*
- * + there is no page to the right of this one; or
- * + it is less than the high key on the page; or
- * + the item it is to follow ("afteritem") appears on this
- * page.
+ * + there is no page to the right of this one; or
+ * + it is less than the high key on the page; or
+ * + the item it is to follow ("afteritem") appears on this
+ * page.
*/
-static bool
+static bool
_bt_goesonpg(Relation rel,
- Buffer buf,
- Size keysz,
- ScanKey scankey,
- BTItem afteritem)
+ Buffer buf,
+ Size keysz,
+ ScanKey scankey,
+ BTItem afteritem)
{
- Page page;
- ItemId hikey;
- BTPageOpaque opaque;
- BTItem chkitem;
- OffsetNumber offnum, maxoff;
- bool found;
-
- page = BufferGetPage(buf);
-
- /* no right neighbor? */
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (P_RIGHTMOST(opaque))
- return (true);
-
- /*
- * this is a non-rightmost page, so it must have a high key item.
- *
- * If the scan key is < the high key (the min key on the next page),
- * then it for sure belongs here.
- */
- hikey = PageGetItemId(page, P_HIKEY);
- if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
- return (true);
-
- /*
- * If the scan key is > the high key, then it for sure doesn't belong
- * here.
- */
-
- if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
- return (false);
-
- /*
- * If we have no adjacency information, and the item is equal to the
- * high key on the page (by here it is), then the item does not belong
- * on this page.
- *
- * Now it's not true in all cases. - vadim 06/10/97
- */
-
- if (afteritem == (BTItem) NULL)
- {
- if ( opaque->btpo_flags & BTP_LEAF )
- return (false);
- if ( opaque->btpo_flags & BTP_CHAIN )
- return (true);
- if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId(page, P_FIRSTKEY),
- BTEqualStrategyNumber) )
- return (true);
- return (false);
- }
-
- /* damn, have to work for it. i hate that. */
- maxoff = PageGetMaxOffsetNumber(page);
-
- /*
- * Search the entire page for the afteroid. We need to do this, rather
- * than doing a binary search and starting from there, because if the
- * key we're searching for is the leftmost key in the tree at this
- * level, then a binary search will do the wrong thing. Splits are
- * pretty infrequent, so the cost isn't as bad as it could be.
- */
-
- found = false;
- for (offnum = P_FIRSTKEY;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum)) {
- chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
-
- if ( BTItemSame (chkitem, afteritem) ) {
- found = true;
- break;
+ Page page;
+ ItemId hikey;
+ BTPageOpaque opaque;
+ BTItem chkitem;
+ OffsetNumber offnum,
+ maxoff;
+ bool found;
+
+ page = BufferGetPage(buf);
+
+ /* no right neighbor? */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_RIGHTMOST(opaque))
+ return (true);
+
+ /*
+ * this is a non-rightmost page, so it must have a high key item.
+ *
+ * If the scan key is < the high key (the min key on the next page), then
+ * it for sure belongs here.
+ */
+ hikey = PageGetItemId(page, P_HIKEY);
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
+ return (true);
+
+ /*
+ * If the scan key is > the high key, then it for sure doesn't belong
+ * here.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
+ return (false);
+
+ /*
+ * If we have no adjacency information, and the item is equal to the
+ * high key on the page (by here it is), then the item does not belong
+ * on this page.
+ *
+ * Now it's not true in all cases. - vadim 06/10/97
+ */
+
+ if (afteritem == (BTItem) NULL)
+ {
+ if (opaque->btpo_flags & BTP_LEAF)
+ return (false);
+ if (opaque->btpo_flags & BTP_CHAIN)
+ return (true);
+ if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, P_FIRSTKEY),
+ BTEqualStrategyNumber))
+ return (true);
+ return (false);
+ }
+
+ /* damn, have to work for it. i hate that. */
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Search the entire page for the afteroid. We need to do this,
+ * rather than doing a binary search and starting from there, because
+ * if the key we're searching for is the leftmost key in the tree at
+ * this level, then a binary search will do the wrong thing. Splits
+ * are pretty infrequent, so the cost isn't as bad as it could be.
+ */
+
+ found = false;
+ for (offnum = P_FIRSTKEY;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+
+ if (BTItemSame(chkitem, afteritem))
+ {
+ found = true;
+ break;
+ }
}
- }
-
- return (found);
+
+ return (found);
}
/*
- * _bt_itemcmp() -- compare item1 to item2 using a requested
- * strategy (<, <=, =, >=, >)
+ * _bt_itemcmp() -- compare item1 to item2 using a requested
+ * strategy (<, <=, =, >=, >)
*
*/
bool
_bt_itemcmp(Relation rel,
- Size keysz,
- BTItem item1,
- BTItem item2,
- StrategyNumber strat)
+ Size keysz,
+ BTItem item1,
+ BTItem item2,
+ StrategyNumber strat)
{
- TupleDesc tupDes;
- IndexTuple indexTuple1, indexTuple2;
- Datum attrDatum1, attrDatum2;
- int i;
- bool isFirstNull, isSecondNull;
- bool compare;
- bool useEqual = false;
-
- if ( strat == BTLessEqualStrategyNumber )
- {
- useEqual = true;
- strat = BTLessStrategyNumber;
- }
- else if ( strat == BTGreaterEqualStrategyNumber )
- {
- useEqual = true;
- strat = BTGreaterStrategyNumber;
- }
-
- tupDes = RelationGetTupleDescriptor(rel);
- indexTuple1 = &(item1->bti_itup);
- indexTuple2 = &(item2->bti_itup);
-
- for (i = 1; i <= keysz; i++) {
- attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isFirstNull);
- attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isSecondNull);
-
- /* see comments about NULLs handling in btbuild */
- if ( isFirstNull ) /* attr in item1 is NULL */
+ TupleDesc tupDes;
+ IndexTuple indexTuple1,
+ indexTuple2;
+ Datum attrDatum1,
+ attrDatum2;
+ int i;
+ bool isFirstNull,
+ isSecondNull;
+ bool compare;
+ bool useEqual = false;
+
+ if (strat == BTLessEqualStrategyNumber)
{
- if ( isSecondNull ) /* attr in item2 is NULL too */
- compare = ( strat == BTEqualStrategyNumber ) ? true : false;
- else
- compare = ( strat == BTGreaterStrategyNumber ) ? true : false;
- }
- else if ( isSecondNull ) /* attr in item1 is NOT_NULL and */
- { /* and attr in item2 is NULL */
- compare = ( strat == BTLessStrategyNumber ) ? true : false;
- }
- else
- {
- compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
+ useEqual = true;
+ strat = BTLessStrategyNumber;
}
-
- if ( compare ) /* true for one of ">, <, =" */
+ else if (strat == BTGreaterEqualStrategyNumber)
{
- if ( strat != BTEqualStrategyNumber )
- return (true);
+ useEqual = true;
+ strat = BTGreaterStrategyNumber;
}
- else /* false for one of ">, <, =" */
+
+ tupDes = RelationGetTupleDescriptor(rel);
+ indexTuple1 = &(item1->bti_itup);
+ indexTuple2 = &(item2->bti_itup);
+
+ for (i = 1; i <= keysz; i++)
{
- if ( strat == BTEqualStrategyNumber )
- return (false);
- /*
- * if original strat was "<=, >=" OR
- * "<, >" but some attribute(s) left
- * - need to test for Equality
- */
- if ( useEqual || i < keysz )
- {
- if ( isFirstNull || isSecondNull )
- compare = ( isFirstNull && isSecondNull ) ? true : false;
- else
- compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
- attrDatum1, attrDatum2);
- if ( compare ) /* item1' and item2' attributes are equal */
- continue; /* - try to compare next attributes */
- }
- return (false);
+ attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isFirstNull);
+ attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isSecondNull);
+
+ /* see comments about NULLs handling in btbuild */
+ if (isFirstNull) /* attr in item1 is NULL */
+ {
+ if (isSecondNull) /* attr in item2 is NULL too */
+ compare = (strat == BTEqualStrategyNumber) ? true : false;
+ else
+ compare = (strat == BTGreaterStrategyNumber) ? true : false;
+ }
+ else if (isSecondNull) /* attr in item1 is NOT_NULL and */
+ { /* and attr in item2 is NULL */
+ compare = (strat == BTLessStrategyNumber) ? true : false;
+ }
+ else
+ {
+ compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
+ }
+
+ if (compare) /* true for one of ">, <, =" */
+ {
+ if (strat != BTEqualStrategyNumber)
+ return (true);
+ }
+ else
+/* false for one of ">, <, =" */
+ {
+ if (strat == BTEqualStrategyNumber)
+ return (false);
+
+ /*
+ * if original strat was "<=, >=" OR "<, >" but some
+ * attribute(s) left - need to test for Equality
+ */
+ if (useEqual || i < keysz)
+ {
+ if (isFirstNull || isSecondNull)
+ compare = (isFirstNull && isSecondNull) ? true : false;
+ else
+ compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
+ attrDatum1, attrDatum2);
+ if (compare) /* item1' and item2' attributes are equal */
+ continue; /* - try to compare next attributes */
+ }
+ return (false);
+ }
}
- }
- return (true);
+ return (true);
}
/*
- * _bt_updateitem() -- updates the key of the item identified by the
- * oid with the key of newItem (done in place if
- * possible)
+ * _bt_updateitem() -- updates the key of the item identified by the
+ * oid with the key of newItem (done in place if
+ * possible)
*
*/
static void
_bt_updateitem(Relation rel,
- Size keysz,
- Buffer buf,
- BTItem oldItem,
- BTItem newItem)
+ Size keysz,
+ Buffer buf,
+ BTItem oldItem,
+ BTItem newItem)
{
- Page page;
- OffsetNumber maxoff;
- OffsetNumber i;
- ItemPointerData itemPtrData;
- BTItem item;
- IndexTuple oldIndexTuple, newIndexTuple;
- int first;
-
- page = BufferGetPage(buf);
- maxoff = PageGetMaxOffsetNumber(page);
-
- /* locate item on the page */
- first = P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page))
- ? P_HIKEY : P_FIRSTKEY;
- i = first;
- do {
- item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
- i = OffsetNumberNext(i);
- } while (i <= maxoff && ! BTItemSame (item, oldItem));
-
- /* this should never happen (in theory) */
- if ( ! BTItemSame (item, oldItem) ) {
- elog(FATAL, "_bt_getstackbuf was lying!!");
- }
-
- /*
- * It's defined by caller (_bt_insertonpg)
- */
- /*
- if(IndexTupleDSize(newItem->bti_itup) >
- IndexTupleDSize(item->bti_itup)) {
- elog(NOTICE, "trying to overwrite a smaller value with a bigger one in _bt_updateitem");
- elog(WARN, "this is not good.");
- }
- */
-
- oldIndexTuple = &(item->bti_itup);
- newIndexTuple = &(newItem->bti_itup);
+ Page page;
+ OffsetNumber maxoff;
+ OffsetNumber i;
+ ItemPointerData itemPtrData;
+ BTItem item;
+ IndexTuple oldIndexTuple,
+ newIndexTuple;
+ int first;
+
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* locate item on the page */
+ first = P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page))
+ ? P_HIKEY : P_FIRSTKEY;
+ i = first;
+ do
+ {
+ item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
+ i = OffsetNumberNext(i);
+ } while (i <= maxoff && !BTItemSame(item, oldItem));
+
+ /* this should never happen (in theory) */
+ if (!BTItemSame(item, oldItem))
+ {
+ elog(FATAL, "_bt_getstackbuf was lying!!");
+ }
+
+ /*
+ * It's defined by caller (_bt_insertonpg)
+ */
+
+ /*
+ * if(IndexTupleDSize(newItem->bti_itup) >
+ * IndexTupleDSize(item->bti_itup)) { elog(NOTICE, "trying to
+ * overwrite a smaller value with a bigger one in _bt_updateitem");
+ * elog(WARN, "this is not good."); }
+ */
+
+ oldIndexTuple = &(item->bti_itup);
+ newIndexTuple = &(newItem->bti_itup);
/* keep the original item pointer */
- ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
- CopyIndexTuple(newIndexTuple, &oldIndexTuple);
- ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
-
+ ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
+ CopyIndexTuple(newIndexTuple, &oldIndexTuple);
+ ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
+
}
/*
@@ -1409,177 +1460,179 @@ _bt_updateitem(Relation rel,
*
* Rule is simple: NOT_NULL not equal NULL, NULL not_equal NULL too.
*/
-static bool
-_bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum,
- int keysz, ScanKey scankey)
+static bool
+_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
+ int keysz, ScanKey scankey)
{
- Datum datum;
- BTItem btitem;
- IndexTuple itup;
- ScanKey entry;
- AttrNumber attno;
- long result;
- int i;
- bool null;
-
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
-
- for (i = 1; i <= keysz; i++)
- {
- entry = &scankey[i - 1];
- attno = entry->sk_attno;
- Assert (attno == i);
- datum = index_getattr(itup, attno, itupdesc, &null);
-
- /* NULLs are not equal */
- if ( entry->sk_flags & SK_ISNULL || null )
- return (false);
-
- result = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
- entry->sk_argument, datum);
- if (result != 0)
- return (false);
- }
-
- /* by here, the keys are equal */
- return (true);
+ Datum datum;
+ BTItem btitem;
+ IndexTuple itup;
+ ScanKey entry;
+ AttrNumber attno;
+ long result;
+ int i;
+ bool null;
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ for (i = 1; i <= keysz; i++)
+ {
+ entry = &scankey[i - 1];
+ attno = entry->sk_attno;
+ Assert(attno == i);
+ datum = index_getattr(itup, attno, itupdesc, &null);
+
+ /* NULLs are not equal */
+ if (entry->sk_flags & SK_ISNULL || null)
+ return (false);
+
+ result = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ entry->sk_argument, datum);
+ if (result != 0)
+ return (false);
+ }
+
+ /* by here, the keys are equal */
+ return (true);
}
#ifdef NOT_USED
/*
- * _bt_shift - insert btitem on the passed page after shifting page
- * to the right in the tree.
+ * _bt_shift - insert btitem on the passed page after shifting page
+ * to the right in the tree.
*
* NOTE: tested for shifting leftmost page only, having btitem < hikey.
*/
-static InsertIndexResult
-_bt_shift (Relation rel, Buffer buf, BTStack stack, int keysz,
- ScanKey scankey, BTItem btitem, BTItem hikey)
+static InsertIndexResult
+_bt_shift(Relation rel, Buffer buf, BTStack stack, int keysz,
+ ScanKey scankey, BTItem btitem, BTItem hikey)
{
- InsertIndexResult res;
- int itemsz;
- Page page;
- BlockNumber bknum;
- BTPageOpaque pageop;
- Buffer rbuf;
- Page rpage;
- BTPageOpaque rpageop;
- Buffer pbuf;
- Page ppage;
- BTPageOpaque ppageop;
- Buffer nbuf;
- Page npage;
- BTPageOpaque npageop;
- BlockNumber nbknum;
- BTItem nitem;
- OffsetNumber afteroff;
-
- btitem = _bt_formitem(&(btitem->bti_itup));
- hikey = _bt_formitem(&(hikey->bti_itup));
-
- page = BufferGetPage(buf);
-
- /* grab new page */
- nbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
- nbknum = BufferGetBlockNumber(nbuf);
- npage = BufferGetPage(nbuf);
- _bt_pageinit(npage, BufferGetPageSize(nbuf));
- npageop = (BTPageOpaque) PageGetSpecialPointer(npage);
-
- /* copy content of the passed page */
- memmove ((char *) npage, (char *) page, BufferGetPageSize(buf));
-
- /* re-init old (passed) page */
- _bt_pageinit(page, BufferGetPageSize(buf));
- pageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* init old page opaque */
- pageop->btpo_flags = npageop->btpo_flags; /* restore flags */
- pageop->btpo_flags &= ~BTP_CHAIN;
- if ( _bt_itemcmp (rel, keysz, hikey, btitem, BTEqualStrategyNumber) )
- pageop->btpo_flags |= BTP_CHAIN;
- pageop->btpo_prev = npageop->btpo_prev; /* restore prev */
- pageop->btpo_next = nbknum; /* next points to the new page */
-
- /* init shifted page opaque */
- npageop->btpo_prev = bknum = BufferGetBlockNumber(buf);
-
- /* shifted page is ok, populate old page */
-
- /* add passed hikey */
- itemsz = IndexTupleDSize(hikey->bti_itup)
- + (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = DOUBLEALIGN(itemsz);
- if ( PageAddItem(page, (Item) hikey, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add hikey in _bt_shift");
- pfree (hikey);
-
- /* add btitem */
- itemsz = IndexTupleDSize(btitem->bti_itup)
- + (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = DOUBLEALIGN(itemsz);
- if ( PageAddItem(page, (Item) btitem, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add firstkey in _bt_shift");
- pfree (btitem);
- nitem = (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY));
- btitem = _bt_formitem(&(nitem->bti_itup));
- ItemPointerSet(&(btitem->bti_itup.t_tid), bknum, P_HIKEY);
-
- /* ok, write them out */
- _bt_wrtnorelbuf(rel, nbuf);
- _bt_wrtnorelbuf(rel, buf);
-
- /* fix btpo_prev on right sibling of old page */
- if ( !P_RIGHTMOST (npageop) )
- {
- rbuf = _bt_getbuf(rel, npageop->btpo_next, BT_WRITE);
- rpage = BufferGetPage(rbuf);
- rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
- rpageop->btpo_prev = nbknum;
- _bt_wrtbuf(rel, rbuf);
- }
-
- /* get parent pointing to the old page */
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
- bknum, P_HIKEY);
- pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
- ppage = BufferGetPage(pbuf);
- ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
-
- _bt_relbuf(rel, nbuf, BT_WRITE);
- _bt_relbuf(rel, buf, BT_WRITE);
-
- /* re-set parent' pointer - we shifted our page to the right ! */
- nitem = (BTItem) PageGetItem (ppage,
- PageGetItemId (ppage, stack->bts_offset));
- ItemPointerSet(&(nitem->bti_itup.t_tid), nbknum, P_HIKEY);
- ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), nbknum, P_HIKEY);
- _bt_wrtnorelbuf(rel, pbuf);
-
- /*
- * Now we want insert into the parent pointer to our old page. It has to
- * be inserted before the pointer to new page. You may get problems here
- * (in the _bt_goesonpg and/or _bt_pgaddtup), but may be not - I don't
- * know. It works if old page is leftmost (nitem is NULL) and
- * btitem < hikey and it's all what we need currently. - vadim 05/30/97
- */
- nitem = NULL;
- afteroff = P_FIRSTKEY;
- if ( !P_RIGHTMOST (ppageop) )
- afteroff = OffsetNumberNext (afteroff);
- if ( stack->bts_offset >= afteroff )
- {
- afteroff = OffsetNumberPrev (stack->bts_offset);
- nitem = (BTItem) PageGetItem (ppage, PageGetItemId (ppage, afteroff));
- nitem = _bt_formitem(&(nitem->bti_itup));
- }
- res = _bt_insertonpg(rel, pbuf, stack->bts_parent,
- keysz, scankey, btitem, nitem);
- pfree (btitem);
-
- ItemPointerSet(&(res->pointerData), nbknum, P_HIKEY);
-
- return (res);
+ InsertIndexResult res;
+ int itemsz;
+ Page page;
+ BlockNumber bknum;
+ BTPageOpaque pageop;
+ Buffer rbuf;
+ Page rpage;
+ BTPageOpaque rpageop;
+ Buffer pbuf;
+ Page ppage;
+ BTPageOpaque ppageop;
+ Buffer nbuf;
+ Page npage;
+ BTPageOpaque npageop;
+ BlockNumber nbknum;
+ BTItem nitem;
+ OffsetNumber afteroff;
+
+ btitem = _bt_formitem(&(btitem->bti_itup));
+ hikey = _bt_formitem(&(hikey->bti_itup));
+
+ page = BufferGetPage(buf);
+
+ /* grab new page */
+ nbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ nbknum = BufferGetBlockNumber(nbuf);
+ npage = BufferGetPage(nbuf);
+ _bt_pageinit(npage, BufferGetPageSize(nbuf));
+ npageop = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ /* copy content of the passed page */
+ memmove((char *) npage, (char *) page, BufferGetPageSize(buf));
+
+ /* re-init old (passed) page */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* init old page opaque */
+ pageop->btpo_flags = npageop->btpo_flags; /* restore flags */
+ pageop->btpo_flags &= ~BTP_CHAIN;
+ if (_bt_itemcmp(rel, keysz, hikey, btitem, BTEqualStrategyNumber))
+ pageop->btpo_flags |= BTP_CHAIN;
+ pageop->btpo_prev = npageop->btpo_prev; /* restore prev */
+ pageop->btpo_next = nbknum; /* next points to the new page */
+
+ /* init shifted page opaque */
+ npageop->btpo_prev = bknum = BufferGetBlockNumber(buf);
+
+ /* shifted page is ok, populate old page */
+
+ /* add passed hikey */
+ itemsz = IndexTupleDSize(hikey->bti_itup)
+ + (sizeof(BTItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+ if (PageAddItem(page, (Item) hikey, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add hikey in _bt_shift");
+ pfree(hikey);
+
+ /* add btitem */
+ itemsz = IndexTupleDSize(btitem->bti_itup)
+ + (sizeof(BTItemData) - sizeof(IndexTupleData));
+ itemsz = DOUBLEALIGN(itemsz);
+ if (PageAddItem(page, (Item) btitem, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add firstkey in _bt_shift");
+ pfree(btitem);
+ nitem = (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY));
+ btitem = _bt_formitem(&(nitem->bti_itup));
+ ItemPointerSet(&(btitem->bti_itup.t_tid), bknum, P_HIKEY);
+
+ /* ok, write them out */
+ _bt_wrtnorelbuf(rel, nbuf);
+ _bt_wrtnorelbuf(rel, buf);
+
+ /* fix btpo_prev on right sibling of old page */
+ if (!P_RIGHTMOST(npageop))
+ {
+ rbuf = _bt_getbuf(rel, npageop->btpo_next, BT_WRITE);
+ rpage = BufferGetPage(rbuf);
+ rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+ rpageop->btpo_prev = nbknum;
+ _bt_wrtbuf(rel, rbuf);
+ }
+
+ /* get parent pointing to the old page */
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid),
+ bknum, P_HIKEY);
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ ppage = BufferGetPage(pbuf);
+ ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage);
+
+ _bt_relbuf(rel, nbuf, BT_WRITE);
+ _bt_relbuf(rel, buf, BT_WRITE);
+
+ /* re-set parent' pointer - we shifted our page to the right ! */
+ nitem = (BTItem) PageGetItem(ppage,
+ PageGetItemId(ppage, stack->bts_offset));
+ ItemPointerSet(&(nitem->bti_itup.t_tid), nbknum, P_HIKEY);
+ ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), nbknum, P_HIKEY);
+ _bt_wrtnorelbuf(rel, pbuf);
+
+ /*
+ * Now we want insert into the parent pointer to our old page. It has
+ * to be inserted before the pointer to new page. You may get problems
+ * here (in the _bt_goesonpg and/or _bt_pgaddtup), but may be not - I
+ * don't know. It works if old page is leftmost (nitem is NULL) and
+ * btitem < hikey and it's all what we need currently. - vadim
+ * 05/30/97
+ */
+ nitem = NULL;
+ afteroff = P_FIRSTKEY;
+ if (!P_RIGHTMOST(ppageop))
+ afteroff = OffsetNumberNext(afteroff);
+ if (stack->bts_offset >= afteroff)
+ {
+ afteroff = OffsetNumberPrev(stack->bts_offset);
+ nitem = (BTItem) PageGetItem(ppage, PageGetItemId(ppage, afteroff));
+ nitem = _bt_formitem(&(nitem->bti_itup));
+ }
+ res = _bt_insertonpg(rel, pbuf, stack->bts_parent,
+ keysz, scankey, btitem, nitem);
+ pfree(btitem);
+
+ ItemPointerSet(&(res->pointerData), nbknum, P_HIKEY);
+
+ return (res);
}
+
#endif
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 9142c557378..6551af4c17c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1,21 +1,21 @@
/*-------------------------------------------------------------------------
*
* nbtpage.c--
- * BTree-specific page management code for the Postgres btree access
- * method.
+ * BTree-specific page management code for the Postgres btree access
+ * method.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.9 1997/08/19 21:29:36 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.10 1997/09/07 04:38:52 momjian Exp $
*
- * NOTES
- * Postgres btree pages look like ordinary relation pages. The opaque
- * data at high addresses includes pointers to left and right siblings
- * and flag data describing page state. The first page in a btree, page
- * zero, is special -- it stores meta-information describing the tree.
- * Pages one and higher store the actual tree data.
+ * NOTES
+ * Postgres btree pages look like ordinary relation pages. The opaque
+ * data at high addresses includes pointers to left and right siblings
+ * and flag data describing page state. The first page in a btree, page
+ * zero, is special -- it stores meta-information describing the tree.
+ * Pages one and higher store the actual tree data.
*
*-------------------------------------------------------------------------
*/
@@ -31,16 +31,16 @@
#include <storage/lmgr.h>
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
-static void _bt_setpagelock(Relation rel, BlockNumber blkno, int access);
-static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _bt_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access);
#define BTREE_METAPAGE 0
-#define BTREE_MAGIC 0x053162
+#define BTREE_MAGIC 0x053162
#ifdef BTREE_VERSION_1
#define BTREE_VERSION 1
@@ -48,546 +48,574 @@ static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access);
#define BTREE_VERSION 0
#endif
-typedef struct BTMetaPageData {
- uint32 btm_magic;
- uint32 btm_version;
- BlockNumber btm_root;
+typedef struct BTMetaPageData
+{
+ uint32 btm_magic;
+ uint32 btm_version;
+ BlockNumber btm_root;
#ifdef BTREE_VERSION_1
- int32 btm_level;
+ int32 btm_level;
#endif
-} BTMetaPageData;
+} BTMetaPageData;
-#define BTPageGetMeta(p) \
- ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])
+#define BTPageGetMeta(p) \
+ ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])
-extern bool BuildingBtree;
+extern bool BuildingBtree;
/*
- * We use high-concurrency locking on btrees. There are two cases in
- * which we don't do locking. One is when we're building the btree.
- * Since the creating transaction has not committed, no one can see
- * the index, and there's no reason to share locks. The second case
- * is when we're just starting up the database system. We use some
- * special-purpose initialization code in the relation cache manager
- * (see utils/cache/relcache.c) to allow us to do indexed scans on
- * the system catalogs before we'd normally be able to. This happens
- * before the lock table is fully initialized, so we can't use it.
- * Strictly speaking, this violates 2pl, but we don't do 2pl on the
- * system catalogs anyway, so I declare this to be okay.
+ * We use high-concurrency locking on btrees. There are two cases in
+ * which we don't do locking. One is when we're building the btree.
+ * Since the creating transaction has not committed, no one can see
+ * the index, and there's no reason to share locks. The second case
+ * is when we're just starting up the database system. We use some
+ * special-purpose initialization code in the relation cache manager
+ * (see utils/cache/relcache.c) to allow us to do indexed scans on
+ * the system catalogs before we'd normally be able to. This happens
+ * before the lock table is fully initialized, so we can't use it.
+ * Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ * system catalogs anyway, so I declare this to be okay.
*/
-#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
+#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
/*
- * _bt_metapinit() -- Initialize the metadata page of a btree.
+ * _bt_metapinit() -- Initialize the metadata page of a btree.
*/
void
_bt_metapinit(Relation rel)
{
- Buffer buf;
- Page pg;
- int nblocks;
- BTMetaPageData metad;
- BTPageOpaque op;
-
- /* can't be sharing this with anyone, now... */
- if (USELOCKING)
- RelationSetLockForWrite(rel);
-
- if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
- elog(WARN, "Cannot initialize non-empty btree %s",
- RelationGetRelationName(rel));
- }
-
- buf = ReadBuffer(rel, P_NEW);
- pg = BufferGetPage(buf);
- _bt_pageinit(pg, BufferGetPageSize(buf));
-
- metad.btm_magic = BTREE_MAGIC;
- metad.btm_version = BTREE_VERSION;
- metad.btm_root = P_NONE;
+ Buffer buf;
+ Page pg;
+ int nblocks;
+ BTMetaPageData metad;
+ BTPageOpaque op;
+
+ /* can't be sharing this with anyone, now... */
+ if (USELOCKING)
+ RelationSetLockForWrite(rel);
+
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0)
+ {
+ elog(WARN, "Cannot initialize non-empty btree %s",
+ RelationGetRelationName(rel));
+ }
+
+ buf = ReadBuffer(rel, P_NEW);
+ pg = BufferGetPage(buf);
+ _bt_pageinit(pg, BufferGetPageSize(buf));
+
+ metad.btm_magic = BTREE_MAGIC;
+ metad.btm_version = BTREE_VERSION;
+ metad.btm_root = P_NONE;
#ifdef BTREE_VERSION_1
- metad.btm_level = 0;
+ metad.btm_level = 0;
#endif
- memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
-
- op = (BTPageOpaque) PageGetSpecialPointer(pg);
- op->btpo_flags = BTP_META;
-
- WriteBuffer(buf);
-
- /* all done */
- if (USELOCKING)
- RelationUnsetLockForWrite(rel);
+ memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
+
+ op = (BTPageOpaque) PageGetSpecialPointer(pg);
+ op->btpo_flags = BTP_META;
+
+ WriteBuffer(buf);
+
+ /* all done */
+ if (USELOCKING)
+ RelationUnsetLockForWrite(rel);
}
#ifdef NOT_USED
/*
- * _bt_checkmeta() -- Verify that the metadata stored in a btree are
- * reasonable.
+ * _bt_checkmeta() -- Verify that the metadata stored in a btree are
+ * reasonable.
*/
void
_bt_checkmeta(Relation rel)
{
- Buffer metabuf;
- Page metap;
- BTMetaPageData *metad;
- BTPageOpaque op;
- int nblocks;
-
- /* if the relation is empty, this is init time; don't complain */
- if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
- return;
-
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
- metap = BufferGetPage(metabuf);
- op = (BTPageOpaque) PageGetSpecialPointer(metap);
- if (!(op->btpo_flags & BTP_META)) {
- elog(WARN, "Invalid metapage for index %s",
- RelationGetRelationName(rel));
- }
- metad = BTPageGetMeta(metap);
-
- if (metad->btm_magic != BTREE_MAGIC) {
- elog(WARN, "Index %s is not a btree",
- RelationGetRelationName(rel));
- }
-
- if (metad->btm_version != BTREE_VERSION) {
- elog(WARN, "Version mismatch on %s: version %d file, version %d code",
- RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION);
- }
-
- _bt_relbuf(rel, metabuf, BT_READ);
+ Buffer metabuf;
+ Page metap;
+ BTMetaPageData *metad;
+ BTPageOpaque op;
+ int nblocks;
+
+ /* if the relation is empty, this is init time; don't complain */
+ if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
+ return;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
+ metap = BufferGetPage(metabuf);
+ op = (BTPageOpaque) PageGetSpecialPointer(metap);
+ if (!(op->btpo_flags & BTP_META))
+ {
+ elog(WARN, "Invalid metapage for index %s",
+ RelationGetRelationName(rel));
+ }
+ metad = BTPageGetMeta(metap);
+
+ if (metad->btm_magic != BTREE_MAGIC)
+ {
+ elog(WARN, "Index %s is not a btree",
+ RelationGetRelationName(rel));
+ }
+
+ if (metad->btm_version != BTREE_VERSION)
+ {
+ elog(WARN, "Version mismatch on %s: version %d file, version %d code",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION);
+ }
+
+ _bt_relbuf(rel, metabuf, BT_READ);
}
+
#endif
/*
- * _bt_getroot() -- Get the root page of the btree.
+ * _bt_getroot() -- Get the root page of the btree.
*
- * Since the root page can move around the btree file, we have to read
- * its location from the metadata page, and then read the root page
- * itself. If no root page exists yet, we have to create one. The
- * standard class of race conditions exists here; I think I covered
- * them all in the Hopi Indian rain dance of lock requests below.
+ * Since the root page can move around the btree file, we have to read
+ * its location from the metadata page, and then read the root page
+ * itself. If no root page exists yet, we have to create one. The
+ * standard class of race conditions exists here; I think I covered
+ * them all in the Hopi Indian rain dance of lock requests below.
*
- * We pass in the access type (BT_READ or BT_WRITE), and return the
- * root page's buffer with the appropriate lock type set. Reference
- * count on the root page gets bumped by ReadBuffer. The metadata
- * page is unlocked and unreferenced by this process when this routine
- * returns.
+ * We pass in the access type (BT_READ or BT_WRITE), and return the
+ * root page's buffer with the appropriate lock type set. Reference
+ * count on the root page gets bumped by ReadBuffer. The metadata
+ * page is unlocked and unreferenced by this process when this routine
+ * returns.
*/
Buffer
_bt_getroot(Relation rel, int access)
{
- Buffer metabuf;
- Page metapg;
- BTPageOpaque metaopaque;
- Buffer rootbuf;
- Page rootpg;
- BTPageOpaque rootopaque;
- BlockNumber rootblkno;
- BTMetaPageData *metad;
-
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
- metapg = BufferGetPage(metabuf);
- metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
- Assert(metaopaque->btpo_flags & BTP_META);
- metad = BTPageGetMeta(metapg);
-
- if (metad->btm_magic != BTREE_MAGIC) {
- elog(WARN, "Index %s is not a btree",
- RelationGetRelationName(rel));
- }
-
- if (metad->btm_version != BTREE_VERSION) {
- elog(WARN, "Version mismatch on %s: version %d file, version %d code",
- RelationGetRelationName(rel),
- metad->btm_version, BTREE_VERSION);
- }
-
- /* if no root page initialized yet, do it */
- if (metad->btm_root == P_NONE) {
-
- /* turn our read lock in for a write lock */
- _bt_relbuf(rel, metabuf, BT_READ);
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ Buffer metabuf;
+ Page metapg;
+ BTPageOpaque metaopaque;
+ Buffer rootbuf;
+ Page rootpg;
+ BTPageOpaque rootopaque;
+ BlockNumber rootblkno;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
Assert(metaopaque->btpo_flags & BTP_META);
metad = BTPageGetMeta(metapg);
-
- /*
- * Race condition: if someone else initialized the metadata between
- * the time we released the read lock and acquired the write lock,
- * above, we want to avoid doing it again.
- */
-
- if (metad->btm_root == P_NONE) {
-
- /*
- * Get, initialize, write, and leave a lock of the appropriate
- * type on the new root page. Since this is the first page in
- * the tree, it's a leaf.
- */
-
- rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
- rootblkno = BufferGetBlockNumber(rootbuf);
- rootpg = BufferGetPage(rootbuf);
- metad->btm_root = rootblkno;
+
+ if (metad->btm_magic != BTREE_MAGIC)
+ {
+ elog(WARN, "Index %s is not a btree",
+ RelationGetRelationName(rel));
+ }
+
+ if (metad->btm_version != BTREE_VERSION)
+ {
+ elog(WARN, "Version mismatch on %s: version %d file, version %d code",
+ RelationGetRelationName(rel),
+ metad->btm_version, BTREE_VERSION);
+ }
+
+ /* if no root page initialized yet, do it */
+ if (metad->btm_root == P_NONE)
+ {
+
+ /* turn our read lock in for a write lock */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metapg);
+
+ /*
+ * Race condition: if someone else initialized the metadata
+ * between the time we released the read lock and acquired the
+ * write lock, above, we want to avoid doing it again.
+ */
+
+ if (metad->btm_root == P_NONE)
+ {
+
+ /*
+ * Get, initialize, write, and leave a lock of the appropriate
+ * type on the new root page. Since this is the first page in
+ * the tree, it's a leaf.
+ */
+
+ rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
+ rootblkno = BufferGetBlockNumber(rootbuf);
+ rootpg = BufferGetPage(rootbuf);
+ metad->btm_root = rootblkno;
#ifdef BTREE_VERSION_1
- metad->btm_level = 1;
+ metad->btm_level = 1;
#endif
- _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
- rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
- _bt_wrtnorelbuf(rel, rootbuf);
-
- /* swap write lock for read lock, if appropriate */
- if (access != BT_WRITE) {
- _bt_setpagelock(rel, rootblkno, BT_READ);
- _bt_unsetpagelock(rel, rootblkno, BT_WRITE);
- }
-
- /* okay, metadata is correct */
- _bt_wrtbuf(rel, metabuf);
- } else {
-
- /*
- * Metadata initialized by someone else. In order to guarantee
- * no deadlocks, we have to release the metadata page and start
- * all over again.
- */
-
- _bt_relbuf(rel, metabuf, BT_WRITE);
- return (_bt_getroot(rel, access));
+ _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
+ _bt_wrtnorelbuf(rel, rootbuf);
+
+ /* swap write lock for read lock, if appropriate */
+ if (access != BT_WRITE)
+ {
+ _bt_setpagelock(rel, rootblkno, BT_READ);
+ _bt_unsetpagelock(rel, rootblkno, BT_WRITE);
+ }
+
+ /* okay, metadata is correct */
+ _bt_wrtbuf(rel, metabuf);
+ }
+ else
+ {
+
+ /*
+ * Metadata initialized by someone else. In order to
+ * guarantee no deadlocks, we have to release the metadata
+ * page and start all over again.
+ */
+
+ _bt_relbuf(rel, metabuf, BT_WRITE);
+ return (_bt_getroot(rel, access));
+ }
}
- } else {
- rootbuf = _bt_getbuf(rel, metad->btm_root, access);
-
- /* done with the meta page */
- _bt_relbuf(rel, metabuf, BT_READ);
- }
-
- /*
- * Race condition: If the root page split between the time we looked
- * at the metadata page and got the root buffer, then we got the wrong
- * buffer.
- */
-
- rootpg = BufferGetPage(rootbuf);
- rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
- if (!(rootopaque->btpo_flags & BTP_ROOT)) {
-
- /* it happened, try again */
- _bt_relbuf(rel, rootbuf, access);
- return (_bt_getroot(rel, access));
- }
-
- /*
- * By here, we have a correct lock on the root block, its reference
- * count is correct, and we have no lock set on the metadata page.
- * Return the root block.
- */
-
- return (rootbuf);
+ else
+ {
+ rootbuf = _bt_getbuf(rel, metad->btm_root, access);
+
+ /* done with the meta page */
+ _bt_relbuf(rel, metabuf, BT_READ);
+ }
+
+ /*
+ * Race condition: If the root page split between the time we looked
+ * at the metadata page and got the root buffer, then we got the wrong
+ * buffer.
+ */
+
+ rootpg = BufferGetPage(rootbuf);
+ rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
+ if (!(rootopaque->btpo_flags & BTP_ROOT))
+ {
+
+ /* it happened, try again */
+ _bt_relbuf(rel, rootbuf, access);
+ return (_bt_getroot(rel, access));
+ }
+
+ /*
+ * By here, we have a correct lock on the root block, its reference
+ * count is correct, and we have no lock set on the metadata page.
+ * Return the root block.
+ */
+
+ return (rootbuf);
}
/*
- * _bt_getbuf() -- Get a buffer by block number for read or write.
+ * _bt_getbuf() -- Get a buffer by block number for read or write.
*
- * When this routine returns, the appropriate lock is set on the
- * requested buffer its reference count is correct.
+ * When this routine returns, the appropriate lock is set on the
+ * requested buffer its reference count is correct.
*/
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
- Buffer buf;
- Page page;
-
- /*
- * If we want a new block, we can't set a lock of the appropriate type
- * until we've instantiated the buffer.
- */
-
- if (blkno != P_NEW) {
- if (access == BT_WRITE)
- _bt_setpagelock(rel, blkno, BT_WRITE);
- else
- _bt_setpagelock(rel, blkno, BT_READ);
-
- buf = ReadBuffer(rel, blkno);
- } else {
- buf = ReadBuffer(rel, blkno);
- blkno = BufferGetBlockNumber(buf);
- page = BufferGetPage(buf);
- _bt_pageinit(page, BufferGetPageSize(buf));
-
- if (access == BT_WRITE)
- _bt_setpagelock(rel, blkno, BT_WRITE);
+ Buffer buf;
+ Page page;
+
+ /*
+ * If we want a new block, we can't set a lock of the appropriate type
+ * until we've instantiated the buffer.
+ */
+
+ if (blkno != P_NEW)
+ {
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+
+ buf = ReadBuffer(rel, blkno);
+ }
else
- _bt_setpagelock(rel, blkno, BT_READ);
- }
-
- /* ref count and lock type are correct */
- return (buf);
+ {
+ buf = ReadBuffer(rel, blkno);
+ blkno = BufferGetBlockNumber(buf);
+ page = BufferGetPage(buf);
+ _bt_pageinit(page, BufferGetPageSize(buf));
+
+ if (access == BT_WRITE)
+ _bt_setpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_setpagelock(rel, blkno, BT_READ);
+ }
+
+ /* ref count and lock type are correct */
+ return (buf);
}
/*
- * _bt_relbuf() -- release a locked buffer.
+ * _bt_relbuf() -- release a locked buffer.
*/
void
_bt_relbuf(Relation rel, Buffer buf, int access)
{
- BlockNumber blkno;
-
- blkno = BufferGetBlockNumber(buf);
-
- /* access had better be one of read or write */
- if (access == BT_WRITE)
- _bt_unsetpagelock(rel, blkno, BT_WRITE);
- else
- _bt_unsetpagelock(rel, blkno, BT_READ);
-
- ReleaseBuffer(buf);
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+
+ /* access had better be one of read or write */
+ if (access == BT_WRITE)
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
+ else
+ _bt_unsetpagelock(rel, blkno, BT_READ);
+
+ ReleaseBuffer(buf);
}
/*
- * _bt_wrtbuf() -- write a btree page to disk.
+ * _bt_wrtbuf() -- write a btree page to disk.
*
- * This routine releases the lock held on the buffer and our reference
- * to it. It is an error to call _bt_wrtbuf() without a write lock
- * or a reference to the buffer.
+ * This routine releases the lock held on the buffer and our reference
+ * to it. It is an error to call _bt_wrtbuf() without a write lock
+ * or a reference to the buffer.
*/
void
_bt_wrtbuf(Relation rel, Buffer buf)
{
- BlockNumber blkno;
-
- blkno = BufferGetBlockNumber(buf);
- WriteBuffer(buf);
- _bt_unsetpagelock(rel, blkno, BT_WRITE);
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteBuffer(buf);
+ _bt_unsetpagelock(rel, blkno, BT_WRITE);
}
/*
- * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
- * our reference or lock.
+ * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
+ * our reference or lock.
*
- * It is an error to call _bt_wrtnorelbuf() without a write lock
- * or a reference to the buffer.
+ * It is an error to call _bt_wrtnorelbuf() without a write lock
+ * or a reference to the buffer.
*/
void
_bt_wrtnorelbuf(Relation rel, Buffer buf)
{
- BlockNumber blkno;
-
- blkno = BufferGetBlockNumber(buf);
- WriteNoReleaseBuffer(buf);
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(buf);
+ WriteNoReleaseBuffer(buf);
}
/*
- * _bt_pageinit() -- Initialize a new page.
+ * _bt_pageinit() -- Initialize a new page.
*/
void
_bt_pageinit(Page page, Size size)
{
- /*
- * Cargo-cult programming -- don't really need this to be zero, but
- * creating new pages is an infrequent occurrence and it makes me feel
- * good when I know they're empty.
- */
-
- memset(page, 0, size);
-
- PageInit(page, size, sizeof(BTPageOpaqueData));
+
+ /*
+ * Cargo-cult programming -- don't really need this to be zero, but
+ * creating new pages is an infrequent occurrence and it makes me feel
+ * good when I know they're empty.
+ */
+
+ memset(page, 0, size);
+
+ PageInit(page, size, sizeof(BTPageOpaqueData));
}
/*
- * _bt_metaproot() -- Change the root page of the btree.
+ * _bt_metaproot() -- Change the root page of the btree.
*
- * Lehman and Yao require that the root page move around in order to
- * guarantee deadlock-free short-term, fine-granularity locking. When
- * we split the root page, we record the new parent in the metadata page
- * for the relation. This routine does the work.
+ * Lehman and Yao require that the root page move around in order to
+ * guarantee deadlock-free short-term, fine-granularity locking. When
+ * we split the root page, we record the new parent in the metadata page
+ * for the relation. This routine does the work.
*
- * No direct preconditions, but if you don't have the a write lock on
- * at least the old root page when you call this, you're making a big
- * mistake. On exit, metapage data is correct and we no longer have
- * a reference to or lock on the metapage.
+ * No direct preconditions, but if you don't have the a write lock on
+ * at least the old root page when you call this, you're making a big
+ * mistake. On exit, metapage data is correct and we no longer have
+ * a reference to or lock on the metapage.
*/
void
_bt_metaproot(Relation rel, BlockNumber rootbknum, int level)
{
- Buffer metabuf;
- Page metap;
- BTPageOpaque metaopaque;
- BTMetaPageData *metad;
-
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
- metap = BufferGetPage(metabuf);
- metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
- Assert(metaopaque->btpo_flags & BTP_META);
- metad = BTPageGetMeta(metap);
- metad->btm_root = rootbknum;
+ Buffer metabuf;
+ Page metap;
+ BTPageOpaque metaopaque;
+ BTMetaPageData *metad;
+
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metap = BufferGetPage(metabuf);
+ metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
+ Assert(metaopaque->btpo_flags & BTP_META);
+ metad = BTPageGetMeta(metap);
+ metad->btm_root = rootbknum;
#ifdef BTREE_VERSION_1
- if ( level == 0 ) /* called from _do_insert */
- metad->btm_level += 1;
- else
- metad->btm_level = level; /* called from btsort */
+ if (level == 0) /* called from _do_insert */
+ metad->btm_level += 1;
+ else
+ metad->btm_level = level; /* called from btsort */
#endif
- _bt_wrtbuf(rel, metabuf);
+ _bt_wrtbuf(rel, metabuf);
}
/*
- * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
- * we last looked at in the parent.
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
+ * we last looked at in the parent.
*
- * This is possible because we save a bit image of the last item
- * we looked at in the parent, and the update algorithm guarantees
- * that if items above us in the tree move, they only move right.
+ * This is possible because we save a bit image of the last item
+ * we looked at in the parent, and the update algorithm guarantees
+ * that if items above us in the tree move, they only move right.
*
- * Also, re-set bts_blkno & bts_offset if changed and
- * bts_btitem (it may be changed - see _bt_insertonpg).
+ * Also, re-set bts_blkno & bts_offset if changed and
+ * bts_btitem (it may be changed - see _bt_insertonpg).
*/
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, int access)
{
- Buffer buf;
- BlockNumber blkno;
- OffsetNumber start, offnum, maxoff;
- OffsetNumber i;
- Page page;
- ItemId itemid;
- BTItem item;
- BTPageOpaque opaque;
- BTItem item_save;
- int item_nbytes;
-
- blkno = stack->bts_blkno;
- buf = _bt_getbuf(rel, blkno, access);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
-
- if (maxoff >= stack->bts_offset) {
- itemid = PageGetItemId(page, stack->bts_offset);
- item = (BTItem) PageGetItem(page, itemid);
-
- /* if the item is where we left it, we're done */
- if ( BTItemSame (item, stack->bts_btitem) )
- {
- pfree(stack->bts_btitem);
- item_nbytes = ItemIdGetLength(itemid);
- item_save = (BTItem) palloc(item_nbytes);
- memmove((char *) item_save, (char *) item, item_nbytes);
- stack->bts_btitem = item_save;
- return (buf);
- }
-
- /* if the item has just moved right on this page, we're done */
- for (i = OffsetNumberNext(stack->bts_offset);
- i <= maxoff;
- i = OffsetNumberNext(i)) {
- itemid = PageGetItemId(page, i);
- item = (BTItem) PageGetItem(page, itemid);
-
- /* if the item is where we left it, we're done */
- if ( BTItemSame (item, stack->bts_btitem) )
- {
- stack->bts_offset = i;
- pfree(stack->bts_btitem);
- item_nbytes = ItemIdGetLength(itemid);
- item_save = (BTItem) palloc(item_nbytes);
- memmove((char *) item_save, (char *) item, item_nbytes);
- stack->bts_btitem = item_save;
- return (buf);
- }
- }
- }
-
- /* by here, the item we're looking for moved right at least one page */
- for (;;) {
- blkno = opaque->btpo_next;
- if (P_RIGHTMOST(opaque))
- elog(FATAL, "my bits moved right off the end of the world!");
-
- _bt_relbuf(rel, buf, access);
+ Buffer buf;
+ BlockNumber blkno;
+ OffsetNumber start,
+ offnum,
+ maxoff;
+ OffsetNumber i;
+ Page page;
+ ItemId itemid;
+ BTItem item;
+ BTPageOpaque opaque;
+ BTItem item_save;
+ int item_nbytes;
+
+ blkno = stack->bts_blkno;
buf = _bt_getbuf(rel, blkno, access);
page = BufferGetPage(buf);
- maxoff = PageGetMaxOffsetNumber(page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* if we have a right sibling, step over the high key */
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- /* see if it's on this page */
- for (offnum = start;
- offnum <= maxoff;
- offnum = OffsetNumberNext(offnum)) {
- itemid = PageGetItemId(page, offnum);
- item = (BTItem) PageGetItem(page, itemid);
- if ( BTItemSame (item, stack->bts_btitem) )
- {
- stack->bts_offset = offnum;
- stack->bts_blkno = blkno;
- pfree(stack->bts_btitem);
- item_nbytes = ItemIdGetLength(itemid);
- item_save = (BTItem) palloc(item_nbytes);
- memmove((char *) item_save, (char *) item, item_nbytes);
- stack->bts_btitem = item_save;
- return (buf);
- }
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (maxoff >= stack->bts_offset)
+ {
+ itemid = PageGetItemId(page, stack->bts_offset);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (BTItemSame(item, stack->bts_btitem))
+ {
+ pfree(stack->bts_btitem);
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) item, item_nbytes);
+ stack->bts_btitem = item_save;
+ return (buf);
+ }
+
+ /* if the item has just moved right on this page, we're done */
+ for (i = OffsetNumberNext(stack->bts_offset);
+ i <= maxoff;
+ i = OffsetNumberNext(i))
+ {
+ itemid = PageGetItemId(page, i);
+ item = (BTItem) PageGetItem(page, itemid);
+
+ /* if the item is where we left it, we're done */
+ if (BTItemSame(item, stack->bts_btitem))
+ {
+ stack->bts_offset = i;
+ pfree(stack->bts_btitem);
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) item, item_nbytes);
+ stack->bts_btitem = item_save;
+ return (buf);
+ }
+ }
+ }
+
+ /* by here, the item we're looking for moved right at least one page */
+ for (;;)
+ {
+ blkno = opaque->btpo_next;
+ if (P_RIGHTMOST(opaque))
+ elog(FATAL, "my bits moved right off the end of the world!");
+
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, blkno, access);
+ page = BufferGetPage(buf);
+ maxoff = PageGetMaxOffsetNumber(page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we have a right sibling, step over the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* see if it's on this page */
+ for (offnum = start;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ itemid = PageGetItemId(page, offnum);
+ item = (BTItem) PageGetItem(page, itemid);
+ if (BTItemSame(item, stack->bts_btitem))
+ {
+ stack->bts_offset = offnum;
+ stack->bts_blkno = blkno;
+ pfree(stack->bts_btitem);
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) item, item_nbytes);
+ stack->bts_btitem = item_save;
+ return (buf);
+ }
+ }
}
- }
}
static void
_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
{
- ItemPointerData iptr;
-
- if (USELOCKING) {
- ItemPointerSet(&iptr, blkno, P_HIKEY);
-
- if (access == BT_WRITE)
- RelationSetSingleWLockPage(rel, &iptr);
- else
- RelationSetSingleRLockPage(rel, &iptr);
- }
+ ItemPointerData iptr;
+
+ if (USELOCKING)
+ {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationSetSingleWLockPage(rel, &iptr);
+ else
+ RelationSetSingleRLockPage(rel, &iptr);
+ }
}
static void
_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
{
- ItemPointerData iptr;
-
- if (USELOCKING) {
- ItemPointerSet(&iptr, blkno, P_HIKEY);
-
- if (access == BT_WRITE)
- RelationUnsetSingleWLockPage(rel, &iptr);
- else
- RelationUnsetSingleRLockPage(rel, &iptr);
- }
+ ItemPointerData iptr;
+
+ if (USELOCKING)
+ {
+ ItemPointerSet(&iptr, blkno, P_HIKEY);
+
+ if (access == BT_WRITE)
+ RelationUnsetSingleWLockPage(rel, &iptr);
+ else
+ RelationUnsetSingleRLockPage(rel, &iptr);
+ }
}
void
_bt_pagedel(Relation rel, ItemPointer tid)
{
- Buffer buf;
- Page page;
- BlockNumber blkno;
- OffsetNumber offno;
-
- blkno = ItemPointerGetBlockNumber(tid);
- offno = ItemPointerGetOffsetNumber(tid);
-
- buf = _bt_getbuf(rel, blkno, BT_WRITE);
- page = BufferGetPage(buf);
-
- PageIndexTupleDelete(page, offno);
-
- /* write the buffer and release the lock */
- _bt_wrtbuf(rel, buf);
+ Buffer buf;
+ Page page;
+ BlockNumber blkno;
+ OffsetNumber offno;
+
+ blkno = ItemPointerGetBlockNumber(tid);
+ offno = ItemPointerGetOffsetNumber(tid);
+
+ buf = _bt_getbuf(rel, blkno, BT_WRITE);
+ page = BufferGetPage(buf);
+
+ PageIndexTupleDelete(page, offno);
+
+ /* write the buffer and release the lock */
+ _bt_wrtbuf(rel, buf);
}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b672901f8db..dccbd77b355 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1,17 +1,17 @@
/*-------------------------------------------------------------------------
*
* btree.c--
- * Implementation of Lehman and Yao's btree management algorithm for
- * Postgres.
+ * Implementation of Lehman and Yao's btree management algorithm for
+ * Postgres.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.19 1997/05/05 03:41:17 vadim Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.20 1997/09/07 04:38:54 momjian Exp $
*
* NOTES
- * This file contains only the public interface routines.
+ * This file contains only the public interface routines.
*
*-------------------------------------------------------------------------
*/
@@ -28,546 +28,579 @@
#include <miscadmin.h>
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
#ifdef BTREE_BUILD_STATS
#include <tcop/tcopprot.h>
-extern int ShowExecutorStats;
+extern int ShowExecutorStats;
+
#endif
-bool BuildingBtree = false; /* see comment in btbuild() */
-bool FastBuild = true; /* use sort/build instead of insertion build */
+bool BuildingBtree = false; /* see comment in btbuild() */
+bool FastBuild = true; /* use sort/build instead of
+ * insertion build */
/*
- * btbuild() -- build a new btree index.
+ * btbuild() -- build a new btree index.
*
- * We use a global variable to record the fact that we're creating
- * a new index. This is used to avoid high-concurrency locking,
- * since the index won't be visible until this transaction commits
- * and since building is guaranteed to be single-threaded.
+ * We use a global variable to record the fact that we're creating
+ * a new index. This is used to avoid high-concurrency locking,
+ * since the index won't be visible until this transaction commits
+ * and since building is guaranteed to be single-threaded.
*/
void
btbuild(Relation heap,
- Relation index,
- int natts,
- AttrNumber *attnum,
- IndexStrategy istrat,
- uint16 pcount,
- Datum *params,
- FuncIndexInfo *finfo,
- PredInfo *predInfo)
+ Relation index,
+ int natts,
+ AttrNumber * attnum,
+ IndexStrategy istrat,
+ uint16 pcount,
+ Datum * params,
+ FuncIndexInfo * finfo,
+ PredInfo * predInfo)
{
- HeapScanDesc hscan;
- Buffer buffer;
- HeapTuple htup;
- IndexTuple itup;
- TupleDesc htupdesc, itupdesc;
- Datum *attdata;
- bool *nulls;
- InsertIndexResult res = 0;
- int nhtups, nitups;
- int i;
- BTItem btitem;
+ HeapScanDesc hscan;
+ Buffer buffer;
+ HeapTuple htup;
+ IndexTuple itup;
+ TupleDesc htupdesc,
+ itupdesc;
+ Datum *attdata;
+ bool *nulls;
+ InsertIndexResult res = 0;
+ int nhtups,
+ nitups;
+ int i;
+ BTItem btitem;
+
#ifndef OMIT_PARTIAL_INDEX
- ExprContext *econtext = (ExprContext *) NULL;
- TupleTable tupleTable = (TupleTable) NULL;
- TupleTableSlot *slot = (TupleTableSlot *) NULL;
-#endif
- Oid hrelid, irelid;
- Node *pred, *oldPred;
- void *spool = (void *) NULL;
- bool isunique;
- bool usefast;
-
- /* note that this is a new btree */
- BuildingBtree = true;
-
- pred = predInfo->pred;
- oldPred = predInfo->oldPred;
-
- /*
- * bootstrap processing does something strange, so don't use
- * sort/build for initial catalog indices. at some point i need
- * to look harder at this. (there is some kind of incremental
- * processing going on there.) -- pma 08/29/95
- */
- usefast = (FastBuild && IsNormalProcessingMode());
+ ExprContext *econtext = (ExprContext *) NULL;
+ TupleTable tupleTable = (TupleTable) NULL;
+ TupleTableSlot *slot = (TupleTableSlot *) NULL;
-#ifdef BTREE_BUILD_STATS
- if ( ShowExecutorStats )
- ResetUsage ();
#endif
+ Oid hrelid,
+ irelid;
+ Node *pred,
+ *oldPred;
+ void *spool = (void *) NULL;
+ bool isunique;
+ bool usefast;
- /* see if index is unique */
- isunique = IndexIsUniqueNoCache(RelationGetRelationId(index));
-
- /* initialize the btree index metadata page (if this is a new index) */
- if (oldPred == NULL)
- _bt_metapinit(index);
-
- /* get tuple descriptors for heap and index relations */
- htupdesc = RelationGetTupleDescriptor(heap);
- itupdesc = RelationGetTupleDescriptor(index);
-
- /* get space for data items that'll appear in the index tuple */
- attdata = (Datum *) palloc(natts * sizeof(Datum));
- nulls = (bool *) palloc(natts * sizeof(bool));
-
- /*
- * If this is a predicate (partial) index, we will need to evaluate the
- * predicate using ExecQual, which requires the current tuple to be in a
- * slot of a TupleTable. In addition, ExecQual must have an ExprContext
- * referring to that slot. Here, we initialize dummy TupleTable and
- * ExprContext objects for this purpose. --Nels, Feb '92
- */
-#ifndef OMIT_PARTIAL_INDEX
- if (pred != NULL || oldPred != NULL) {
- tupleTable = ExecCreateTupleTable(1);
- slot = ExecAllocTableSlot(tupleTable);
- econtext = makeNode(ExprContext);
- FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
+ /* note that this is a new btree */
+ BuildingBtree = true;
+
+ pred = predInfo->pred;
+ oldPred = predInfo->oldPred;
/*
- * we never want to use sort/build if we are extending an
- * existing partial index -- it works by inserting the
- * newly-qualifying tuples into the existing index.
- * (sort/build would overwrite the existing index with one
- * consisting of the newly-qualifying tuples.)
+ * bootstrap processing does something strange, so don't use
+ * sort/build for initial catalog indices. at some point i need to
+ * look harder at this. (there is some kind of incremental processing
+ * going on there.) -- pma 08/29/95
*/
- usefast = false;
- }
-#endif /* OMIT_PARTIAL_INDEX */
-
- /* start a heap scan */
- hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
- htup = heap_getnext(hscan, 0, &buffer);
-
- /* build the index */
- nhtups = nitups = 0;
-
- if (usefast) {
- spool = _bt_spoolinit(index, 7, isunique);
- res = (InsertIndexResult) NULL;
- }
-
- for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
-
- nhtups++;
-
+ usefast = (FastBuild && IsNormalProcessingMode());
+
+#ifdef BTREE_BUILD_STATS
+ if (ShowExecutorStats)
+ ResetUsage();
+#endif
+
+ /* see if index is unique */
+ isunique = IndexIsUniqueNoCache(RelationGetRelationId(index));
+
+ /* initialize the btree index metadata page (if this is a new index) */
+ if (oldPred == NULL)
+ _bt_metapinit(index);
+
+ /* get tuple descriptors for heap and index relations */
+ htupdesc = RelationGetTupleDescriptor(heap);
+ itupdesc = RelationGetTupleDescriptor(index);
+
+ /* get space for data items that'll appear in the index tuple */
+ attdata = (Datum *) palloc(natts * sizeof(Datum));
+ nulls = (bool *) palloc(natts * sizeof(bool));
+
/*
- * If oldPred != NULL, this is an EXTEND INDEX command, so skip
- * this tuple if it was already in the existing partial index
+ * If this is a predicate (partial) index, we will need to evaluate
+ * the predicate using ExecQual, which requires the current tuple to
+ * be in a slot of a TupleTable. In addition, ExecQual must have an
+ * ExprContext referring to that slot. Here, we initialize dummy
+ * TupleTable and ExprContext objects for this purpose. --Nels, Feb
+ * '92
*/
- if (oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+ if (pred != NULL || oldPred != NULL)
+ {
+ tupleTable = ExecCreateTupleTable(1);
+ slot = ExecAllocTableSlot(tupleTable);
+ econtext = makeNode(ExprContext);
+ FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
+
+ /*
+ * we never want to use sort/build if we are extending an existing
+ * partial index -- it works by inserting the newly-qualifying
+ * tuples into the existing index. (sort/build would overwrite the
+ * existing index with one consisting of the newly-qualifying
+ * tuples.)
+ */
+ usefast = false;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+
+ /* start a heap scan */
+ hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+ htup = heap_getnext(hscan, 0, &buffer);
+
+ /* build the index */
+ nhtups = nitups = 0;
+
+ if (usefast)
+ {
+ spool = _bt_spoolinit(index, 7, isunique);
+ res = (InsertIndexResult) NULL;
+ }
+
+ for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer))
+ {
+
+ nhtups++;
+
+ /*
+ * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+ * this tuple if it was already in the existing partial index
+ */
+ if (oldPred != NULL)
+ {
#ifndef OMIT_PARTIAL_INDEX
- /*SetSlotContents(slot, htup);*/
- slot->val = htup;
- if (ExecQual((List*)oldPred, econtext) == true) {
+ /* SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List *) oldPred, econtext) == true)
+ {
+ nitups++;
+ continue;
+ }
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
+ /*
+ * Skip this tuple if it doesn't satisfy the partial-index
+ * predicate
+ */
+ if (pred != NULL)
+ {
+#ifndef OMIT_PARTIAL_INDEX
+ /* SetSlotContents(slot, htup); */
+ slot->val = htup;
+ if (ExecQual((List *) pred, econtext) == false)
+ continue;
+#endif /* OMIT_PARTIAL_INDEX */
+ }
+
nitups++;
- continue;
- }
-#endif /* OMIT_PARTIAL_INDEX */
+
+ /*
+ * For the current heap tuple, extract all the attributes we use
+ * in this index, and note which are null.
+ */
+
+ for (i = 1; i <= natts; i++)
+ {
+ int attoff;
+ bool attnull;
+
+ /*
+ * Offsets are from the start of the tuple, and are
+ * zero-based; indices are one-based. The next call returns i
+ * - 1. That's data hiding for you.
+ */
+
+ attoff = AttrNumberGetAttrOffset(i);
+ attdata[attoff] = GetIndexValue(htup,
+ htupdesc,
+ attoff,
+ attnum,
+ finfo,
+ &attnull,
+ buffer);
+ nulls[attoff] = (attnull ? 'n' : ' ');
+ }
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_formtuple(itupdesc, attdata, nulls);
+
+ /*
+ * If the single index key is null, we don't insert it into the
+ * index. Btrees support scans on <, <=, =, >=, and >. Relational
+ * algebra says that A op B (where op is one of the operators
+ * above) returns null if either A or B is null. This means that
+ * no qualification used in an index scan could ever return true
+ * on a null attribute. It also means that indices can't be used
+ * by ISNULL or NOTNULL scans, but that's an artifact of the
+ * strategy map architecture chosen in 1986, not of the way nulls
+ * are handled here.
+ */
+
+ /*
+ * New comments: NULLs handling. While we can't do NULL
+ * comparison, we can follow simple rule for ordering items on
+ * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE.
+ * Sure, it's just rule for placing/finding items and no more -
+ * keytest'll return FALSE for a = 5 for items having 'a' isNULL.
+ * Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for how it
+ * works. - vadim 03/23/97
+ *
+ * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; }
+ */
+
+ itup->t_tid = htup->t_ctid;
+ btitem = _bt_formitem(itup);
+
+ /*
+ * if we are doing bottom-up btree build, we insert the index into
+ * a spool page for subsequent processing. otherwise, we insert
+ * into the btree.
+ */
+ if (usefast)
+ {
+ _bt_spool(index, btitem, spool);
+ }
+ else
+ {
+ res = _bt_doinsert(index, btitem, isunique, heap);
+ }
+
+ pfree(btitem);
+ pfree(itup);
+ if (res)
+ {
+ pfree(res);
+ }
}
-
- /* Skip this tuple if it doesn't satisfy the partial-index predicate */
- if (pred != NULL) {
+
+ /* okay, all heap tuples are indexed */
+ heap_endscan(hscan);
+
+ if (pred != NULL || oldPred != NULL)
+ {
#ifndef OMIT_PARTIAL_INDEX
- /* SetSlotContents(slot, htup); */
- slot->val = htup;
- if (ExecQual((List*)pred, econtext) == false)
- continue;
-#endif /* OMIT_PARTIAL_INDEX */
+ ExecDestroyTupleTable(tupleTable, true);
+ pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */
}
-
- nitups++;
-
+
/*
- * For the current heap tuple, extract all the attributes
- * we use in this index, and note which are null.
+ * if we are doing bottom-up btree build, we now have a bunch of
+ * sorted runs in the spool pages. finish the build by (1) merging
+ * the runs, (2) inserting the sorted tuples into btree pages and (3)
+ * building the upper levels.
*/
-
- for (i = 1; i <= natts; i++) {
- int attoff;
- bool attnull;
-
- /*
- * Offsets are from the start of the tuple, and are
- * zero-based; indices are one-based. The next call
- * returns i - 1. That's data hiding for you.
- */
-
- attoff = AttrNumberGetAttrOffset(i);
- attdata[attoff] = GetIndexValue(htup,
- htupdesc,
- attoff,
- attnum,
- finfo,
- &attnull,
- buffer);
- nulls[attoff] = (attnull ? 'n' : ' ');
+ if (usefast)
+ {
+ _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
+ _bt_leafbuild(index, spool);
+ _bt_spooldestroy(spool);
}
-
- /* form an index tuple and point it at the heap tuple */
- itup = index_formtuple(itupdesc, attdata, nulls);
-
- /*
- * If the single index key is null, we don't insert it into
- * the index. Btrees support scans on <, <=, =, >=, and >.
- * Relational algebra says that A op B (where op is one of the
- * operators above) returns null if either A or B is null. This
- * means that no qualification used in an index scan could ever
- * return true on a null attribute. It also means that indices
- * can't be used by ISNULL or NOTNULL scans, but that's an
- * artifact of the strategy map architecture chosen in 1986, not
- * of the way nulls are handled here.
- */
- /*
- * New comments: NULLs handling.
- * While we can't do NULL comparison, we can follow simple
- * rule for ordering items on btree pages - NULLs greater
- * NOT_NULLs and NULL = NULL is TRUE. Sure, it's just rule
- * for placing/finding items and no more - keytest'll return
- * FALSE for a = 5 for items having 'a' isNULL.
- * Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for
- * how it works. - vadim 03/23/97
-
- if (itup->t_info & INDEX_NULL_MASK) {
- pfree(itup);
- continue;
+
+#ifdef BTREE_BUILD_STATS
+ if (ShowExecutorStats)
+ {
+ fprintf(stderr, "! BtreeBuild Stats:\n");
+ ShowUsage();
+ ResetUsage();
}
- */
-
- itup->t_tid = htup->t_ctid;
- btitem = _bt_formitem(itup);
+#endif
/*
- * if we are doing bottom-up btree build, we insert the index
- * into a spool page for subsequent processing. otherwise, we
- * insert into the btree.
+ * Since we just counted the tuples in the heap, we update its stats
+ * in pg_class to guarantee that the planner takes advantage of the
+ * index we just created. Finally, only update statistics during
+ * normal index definitions, not for indices on system catalogs
+ * created during bootstrap processing. We must close the relations
+ * before updatings statistics to guarantee that the relcache entries
+ * are flushed when we increment the command counter in UpdateStats().
*/
- if (usefast) {
- _bt_spool(index, btitem, spool);
- } else {
- res = _bt_doinsert(index, btitem, isunique, heap);
+ if (IsNormalProcessingMode())
+ {
+ hrelid = heap->rd_id;
+ irelid = index->rd_id;
+ heap_close(heap);
+ index_close(index);
+ UpdateStats(hrelid, nhtups, true);
+ UpdateStats(irelid, nitups, false);
+ if (oldPred != NULL)
+ {
+ if (nitups == nhtups)
+ pred = NULL;
+ UpdateIndexPredicate(irelid, oldPred, pred);
+ }
}
- pfree(btitem);
- pfree(itup);
- if (res) {
- pfree(res);
- }
- }
-
- /* okay, all heap tuples are indexed */
- heap_endscan(hscan);
-
- if (pred != NULL || oldPred != NULL) {
-#ifndef OMIT_PARTIAL_INDEX
- ExecDestroyTupleTable(tupleTable, true);
- pfree(econtext);
-#endif /* OMIT_PARTIAL_INDEX */
- }
-
- /*
- * if we are doing bottom-up btree build, we now have a bunch of
- * sorted runs in the spool pages. finish the build by (1)
- * merging the runs, (2) inserting the sorted tuples into btree
- * pages and (3) building the upper levels.
- */
- if (usefast) {
- _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */
- _bt_leafbuild(index, spool);
- _bt_spooldestroy(spool);
- }
+ pfree(nulls);
+ pfree(attdata);
-#ifdef BTREE_BUILD_STATS
- if ( ShowExecutorStats )
- {
- fprintf(stderr, "! BtreeBuild Stats:\n");
- ShowUsage ();
- ResetUsage ();
- }
-#endif
-
- /*
- * Since we just counted the tuples in the heap, we update its
- * stats in pg_class to guarantee that the planner takes advantage
- * of the index we just created. Finally, only update statistics
- * during normal index definitions, not for indices on system catalogs
- * created during bootstrap processing. We must close the relations
- * before updatings statistics to guarantee that the relcache entries
- * are flushed when we increment the command counter in UpdateStats().
- */
- if (IsNormalProcessingMode())
- {
- hrelid = heap->rd_id;
- irelid = index->rd_id;
- heap_close(heap);
- index_close(index);
- UpdateStats(hrelid, nhtups, true);
- UpdateStats(irelid, nitups, false);
- if (oldPred != NULL) {
- if (nitups == nhtups) pred = NULL;
- UpdateIndexPredicate(irelid, oldPred, pred);
- }
- }
-
- pfree(nulls);
- pfree(attdata);
-
- /* all done */
- BuildingBtree = false;
+ /* all done */
+ BuildingBtree = false;
}
/*
- * btinsert() -- insert an index tuple into a btree.
+ * btinsert() -- insert an index tuple into a btree.
*
- * Descend the tree recursively, find the appropriate location for our
- * new tuple, put it there, set its unique OID as appropriate, and
- * return an InsertIndexResult to the caller.
+ * Descend the tree recursively, find the appropriate location for our
+ * new tuple, put it there, set its unique OID as appropriate, and
+ * return an InsertIndexResult to the caller.
*/
InsertIndexResult
-btinsert(Relation rel, Datum *datum, char *nulls, ItemPointer ht_ctid, Relation heapRel)
+btinsert(Relation rel, Datum * datum, char *nulls, ItemPointer ht_ctid, Relation heapRel)
{
- BTItem btitem;
- IndexTuple itup;
- InsertIndexResult res;
-
- /* generate an index tuple */
- itup = index_formtuple(RelationGetTupleDescriptor(rel), datum, nulls);
- itup->t_tid = *ht_ctid;
-
- /*
- * See comments in btbuild.
-
- if (itup->t_info & INDEX_NULL_MASK)
- return ((InsertIndexResult) NULL);
- */
-
- btitem = _bt_formitem(itup);
-
- res = _bt_doinsert(rel, btitem,
- IndexIsUnique(RelationGetRelationId(rel)), heapRel);
-
- pfree(btitem);
- pfree(itup);
-
- /* adjust any active scans that will be affected by this insertion */
- _bt_adjscans(rel, &(res->pointerData), BT_INSERT);
-
- return (res);
+ BTItem btitem;
+ IndexTuple itup;
+ InsertIndexResult res;
+
+ /* generate an index tuple */
+ itup = index_formtuple(RelationGetTupleDescriptor(rel), datum, nulls);
+ itup->t_tid = *ht_ctid;
+
+ /*
+ * See comments in btbuild.
+ *
+ * if (itup->t_info & INDEX_NULL_MASK) return ((InsertIndexResult) NULL);
+ */
+
+ btitem = _bt_formitem(itup);
+
+ res = _bt_doinsert(rel, btitem,
+ IndexIsUnique(RelationGetRelationId(rel)), heapRel);
+
+ pfree(btitem);
+ pfree(itup);
+
+ /* adjust any active scans that will be affected by this insertion */
+ _bt_adjscans(rel, &(res->pointerData), BT_INSERT);
+
+ return (res);
}
/*
- * btgettuple() -- Get the next tuple in the scan.
+ * btgettuple() -- Get the next tuple in the scan.
*/
-char *
+char *
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
- RetrieveIndexResult res;
-
- /*
- * If we've already initialized this scan, we can just advance it
- * in the appropriate direction. If we haven't done so yet, we
- * call a routine to get the first item in the scan.
- */
-
- if (ItemPointerIsValid(&(scan->currentItemData)))
- res = _bt_next(scan, dir);
- else
- res = _bt_first(scan, dir);
-
- return ((char *) res);
+ RetrieveIndexResult res;
+
+ /*
+ * If we've already initialized this scan, we can just advance it in
+ * the appropriate direction. If we haven't done so yet, we call a
+ * routine to get the first item in the scan.
+ */
+
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ res = _bt_next(scan, dir);
+ else
+ res = _bt_first(scan, dir);
+
+ return ((char *) res);
}
/*
- * btbeginscan() -- start a scan on a btree index
+ * btbeginscan() -- start a scan on a btree index
*/
-char *
+char *
btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
{
- IndexScanDesc scan;
-
- /* get the scan */
- scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
-
- /* register scan in case we change pages it's using */
- _bt_regscan(scan);
-
- return ((char *) scan);
+ IndexScanDesc scan;
+
+ /* get the scan */
+ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+
+ /* register scan in case we change pages it's using */
+ _bt_regscan(scan);
+
+ return ((char *) scan);
}
/*
- * btrescan() -- rescan an index relation
+ * btrescan() -- rescan an index relation
*/
void
btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
{
- ItemPointer iptr;
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
-
- /* we hold a read lock on the current page in the scan */
- if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
- _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- /* and we hold a read lock on the last marked item in the scan */
- if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
- _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
- so->btso_mrkbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- if ( so == NULL ) /* if called from btbeginscan */
- {
- so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
- so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
- so->keyData = (ScanKey) NULL;
- if ( scan->numberOfKeys > 0)
- so->keyData = (ScanKey) palloc (scan->numberOfKeys * sizeof(ScanKeyData));
- scan->opaque = so;
- scan->flags = 0x0;
- }
-
- /*
- * Reset the scan keys. Note that keys ordering stuff
- * moved to _bt_first. - vadim 05/05/97
- */
- so->numberOfKeys = scan->numberOfKeys;
- if (scan->numberOfKeys > 0) {
- memmove(scan->keyData,
- scankey,
- scan->numberOfKeys * sizeof(ScanKeyData));
- memmove(so->keyData,
- scankey,
- so->numberOfKeys * sizeof(ScanKeyData));
- }
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* we hold a read lock on the current page in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
+ {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* and we hold a read lock on the last marked item in the scan */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
+ {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (so == NULL) /* if called from btbeginscan */
+ {
+ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
+ so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
+ so->keyData = (ScanKey) NULL;
+ if (scan->numberOfKeys > 0)
+ so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
+ scan->opaque = so;
+ scan->flags = 0x0;
+ }
+
+ /*
+ * Reset the scan keys. Note that keys ordering stuff moved to
+ * _bt_first. - vadim 05/05/97
+ */
+ so->numberOfKeys = scan->numberOfKeys;
+ if (scan->numberOfKeys > 0)
+ {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ memmove(so->keyData,
+ scankey,
+ so->numberOfKeys * sizeof(ScanKeyData));
+ }
}
void
btmovescan(IndexScanDesc scan, Datum v)
{
- ItemPointer iptr;
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
-
- /* release any locks we still hold */
- if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
- _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
-/* scan->keyData[0].sk_argument = v; */
- so->keyData[0].sk_argument = v;
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
+ {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+/* scan->keyData[0].sk_argument = v; */
+ so->keyData[0].sk_argument = v;
}
/*
- * btendscan() -- close down a scan
+ * btendscan() -- close down a scan
*/
void
btendscan(IndexScanDesc scan)
{
- ItemPointer iptr;
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
-
- /* release any locks we still hold */
- if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
- if (BufferIsValid(so->btso_curbuf))
- _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
- if (BufferIsValid(so->btso_mrkbuf))
- _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
- so->btso_mrkbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- if ( so->keyData != (ScanKey) NULL )
- pfree (so->keyData);
- pfree (so);
-
- _bt_dropscan(scan);
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release any locks we still hold */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
+ {
+ if (BufferIsValid(so->btso_curbuf))
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
+ {
+ if (BufferIsValid(so->btso_mrkbuf))
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ if (so->keyData != (ScanKey) NULL)
+ pfree(so->keyData);
+ pfree(so);
+
+ _bt_dropscan(scan);
}
/*
- * btmarkpos() -- save current scan position
+ * btmarkpos() -- save current scan position
*/
void
btmarkpos(IndexScanDesc scan)
{
- ItemPointer iptr;
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
-
- /* release lock on old marked data, if any */
- if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
- _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
- so->btso_mrkbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- /* bump lock on currentItemData and copy to currentMarkData */
- if (ItemPointerIsValid(&(scan->currentItemData))) {
- so->btso_mrkbuf = _bt_getbuf(scan->relation,
- BufferGetBlockNumber(so->btso_curbuf),
- BT_READ);
- scan->currentMarkData = scan->currentItemData;
- }
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on old marked data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
+ {
+ _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
+ so->btso_mrkbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentItemData and copy to currentMarkData */
+ if (ItemPointerIsValid(&(scan->currentItemData)))
+ {
+ so->btso_mrkbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_curbuf),
+ BT_READ);
+ scan->currentMarkData = scan->currentItemData;
+ }
}
/*
- * btrestrpos() -- restore scan to last saved position
+ * btrestrpos() -- restore scan to last saved position
*/
void
btrestrpos(IndexScanDesc scan)
{
- ItemPointer iptr;
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
-
- /* release lock on current data, if any */
- if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
- _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(iptr);
- }
-
- /* bump lock on currentMarkData and copy to currentItemData */
- if (ItemPointerIsValid(&(scan->currentMarkData))) {
- so->btso_curbuf = _bt_getbuf(scan->relation,
- BufferGetBlockNumber(so->btso_mrkbuf),
- BT_READ);
-
- scan->currentItemData = scan->currentMarkData;
- }
+ ItemPointer iptr;
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+
+ /* release lock on current data, if any */
+ if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
+ {
+ _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(iptr);
+ }
+
+ /* bump lock on currentMarkData and copy to currentItemData */
+ if (ItemPointerIsValid(&(scan->currentMarkData)))
+ {
+ so->btso_curbuf = _bt_getbuf(scan->relation,
+ BufferGetBlockNumber(so->btso_mrkbuf),
+ BT_READ);
+
+ scan->currentItemData = scan->currentMarkData;
+ }
}
/* stubs */
void
btdelete(Relation rel, ItemPointer tid)
{
- /* adjust any active scans that will be affected by this deletion */
- _bt_adjscans(rel, tid, BT_DELETE);
-
- /* delete the data from the page */
- _bt_pagedel(rel, tid);
+ /* adjust any active scans that will be affected by this deletion */
+ _bt_adjscans(rel, tid, BT_DELETE);
+
+ /* delete the data from the page */
+ _bt_pagedel(rel, tid);
}
diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c
index 5e23fe13d7b..8a2042403ad 100644
--- a/src/backend/access/nbtree/nbtscan.c
+++ b/src/backend/access/nbtree/nbtscan.c
@@ -1,28 +1,28 @@
/*-------------------------------------------------------------------------
*
* btscan.c--
- * manage scans on btrees.
+ * manage scans on btrees.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.7 1997/02/18 17:13:45 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.8 1997/09/07 04:38:57 momjian Exp $
*
*
* NOTES
- * Because we can be doing an index scan on a relation while we update
- * it, we need to avoid missing data that moves around in the index.
- * The routines and global variables in this file guarantee that all
- * scans in the local address space stay correctly positioned. This
- * is all we need to worry about, since write locking guarantees that
- * no one else will be on the same page at the same time as we are.
+ * Because we can be doing an index scan on a relation while we update
+ * it, we need to avoid missing data that moves around in the index.
+ * The routines and global variables in this file guarantee that all
+ * scans in the local address space stay correctly positioned. This
+ * is all we need to worry about, since write locking guarantees that
+ * no one else will be on the same page at the same time as we are.
*
- * The scheme is to manage a list of active scans in the current backend.
- * Whenever we add or remove records from an index, or whenever we
- * split a leaf page, we check the list of active scans to see if any
- * has been affected. A scan is affected only if it is on the same
- * relation, and the same page, as the update.
+ * The scheme is to manage a list of active scans in the current backend.
+ * Whenever we add or remove records from an index, or whenever we
+ * split a leaf page, we check the list of active scans to see if any
+ * has been affected. A scan is affected only if it is on the same
+ * relation, and the same page, as the update.
*
*-------------------------------------------------------------------------
*/
@@ -32,83 +32,87 @@
#include <storage/bufpage.h>
#include <access/nbtree.h>
-typedef struct BTScanListData {
- IndexScanDesc btsl_scan;
- struct BTScanListData *btsl_next;
-} BTScanListData;
+typedef struct BTScanListData
+{
+ IndexScanDesc btsl_scan;
+ struct BTScanListData *btsl_next;
+} BTScanListData;
-typedef BTScanListData *BTScanList;
+typedef BTScanListData *BTScanList;
-static BTScanList BTScans = (BTScanList) NULL;
+static BTScanList BTScans = (BTScanList) NULL;
-static void _bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno);
-static bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+static void _bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno);
+static bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
/*
- * _bt_regscan() -- register a new scan.
+ * _bt_regscan() -- register a new scan.
*/
void
_bt_regscan(IndexScanDesc scan)
{
- BTScanList new_el;
-
- new_el = (BTScanList) palloc(sizeof(BTScanListData));
- new_el->btsl_scan = scan;
- new_el->btsl_next = BTScans;
- BTScans = new_el;
+ BTScanList new_el;
+
+ new_el = (BTScanList) palloc(sizeof(BTScanListData));
+ new_el->btsl_scan = scan;
+ new_el->btsl_next = BTScans;
+ BTScans = new_el;
}
/*
- * _bt_dropscan() -- drop a scan from the scan list
+ * _bt_dropscan() -- drop a scan from the scan list
*/
void
_bt_dropscan(IndexScanDesc scan)
{
- BTScanList chk, last;
-
- last = (BTScanList) NULL;
- for (chk = BTScans;
- chk != (BTScanList) NULL && chk->btsl_scan != scan;
- chk = chk->btsl_next) {
- last = chk;
- }
-
- if (chk == (BTScanList) NULL)
- elog(WARN, "btree scan list trashed; can't find 0x%lx", scan);
-
- if (last == (BTScanList) NULL)
- BTScans = chk->btsl_next;
- else
- last->btsl_next = chk->btsl_next;
-
- pfree (chk);
+ BTScanList chk,
+ last;
+
+ last = (BTScanList) NULL;
+ for (chk = BTScans;
+ chk != (BTScanList) NULL && chk->btsl_scan != scan;
+ chk = chk->btsl_next)
+ {
+ last = chk;
+ }
+
+ if (chk == (BTScanList) NULL)
+ elog(WARN, "btree scan list trashed; can't find 0x%lx", scan);
+
+ if (last == (BTScanList) NULL)
+ BTScans = chk->btsl_next;
+ else
+ last->btsl_next = chk->btsl_next;
+
+ pfree(chk);
}
/*
- * _bt_adjscans() -- adjust all scans in the scan list to compensate
- * for a given deletion or insertion
+ * _bt_adjscans() -- adjust all scans in the scan list to compensate
+ * for a given deletion or insertion
*/
void
_bt_adjscans(Relation rel, ItemPointer tid, int op)
{
- BTScanList l;
- Oid relid;
-
- relid = rel->rd_id;
- for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
- if (relid == l->btsl_scan->relation->rd_id)
- _bt_scandel(l->btsl_scan, op,
- ItemPointerGetBlockNumber(tid),
- ItemPointerGetOffsetNumber(tid));
- }
+ BTScanList l;
+ Oid relid;
+
+ relid = rel->rd_id;
+ for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next)
+ {
+ if (relid == l->btsl_scan->relation->rd_id)
+ _bt_scandel(l->btsl_scan, op,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ }
}
/*
- * _bt_scandel() -- adjust a single scan
+ * _bt_scandel() -- adjust a single scan
*
* because each index page is always maintained as an ordered array of
* index tuples, the index tuples on a given page shift beneath any
- * given scan. an index modification "behind" a scan position (i.e.,
+ * given scan. an index modification "behind" a scan position (i.e.,
* same page, lower or equal offset number) will therefore force us to
* adjust the scan in the following ways:
*
@@ -126,80 +130,85 @@ _bt_adjscans(Relation rel, ItemPointer tid, int op)
static void
_bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno)
{
- ItemPointer current;
- Buffer buf;
- BTScanOpaque so;
-
- if (!_bt_scantouched(scan, blkno, offno))
- return;
-
- so = (BTScanOpaque) scan->opaque;
- buf = so->btso_curbuf;
-
- current = &(scan->currentItemData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno) {
- switch (op) {
- case BT_INSERT:
- _bt_step(scan, &buf, ForwardScanDirection);
- break;
- case BT_DELETE:
- _bt_step(scan, &buf, BackwardScanDirection);
- break;
- default:
- elog(WARN, "_bt_scandel: bad operation '%d'", op);
- /*NOTREACHED*/
+ ItemPointer current;
+ Buffer buf;
+ BTScanOpaque so;
+
+ if (!_bt_scantouched(scan, blkno, offno))
+ return;
+
+ so = (BTScanOpaque) scan->opaque;
+ buf = so->btso_curbuf;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ {
+ switch (op)
+ {
+ case BT_INSERT:
+ _bt_step(scan, &buf, ForwardScanDirection);
+ break;
+ case BT_DELETE:
+ _bt_step(scan, &buf, BackwardScanDirection);
+ break;
+ default:
+ elog(WARN, "_bt_scandel: bad operation '%d'", op);
+ /* NOTREACHED */
+ }
+ so->btso_curbuf = buf;
}
- so->btso_curbuf = buf;
- }
-
- current = &(scan->currentMarkData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno) {
- ItemPointerData tmp;
- tmp = *current;
- *current = scan->currentItemData;
- scan->currentItemData = tmp;
- switch (op) {
- case BT_INSERT:
- _bt_step(scan, &buf, ForwardScanDirection);
- break;
- case BT_DELETE:
- _bt_step(scan, &buf, BackwardScanDirection);
- break;
- default:
- elog(WARN, "_bt_scandel: bad operation '%d'", op);
- /*NOTREACHED*/
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ {
+ ItemPointerData tmp;
+
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
+ switch (op)
+ {
+ case BT_INSERT:
+ _bt_step(scan, &buf, ForwardScanDirection);
+ break;
+ case BT_DELETE:
+ _bt_step(scan, &buf, BackwardScanDirection);
+ break;
+ default:
+ elog(WARN, "_bt_scandel: bad operation '%d'", op);
+ /* NOTREACHED */
+ }
+ so->btso_mrkbuf = buf;
+ tmp = *current;
+ *current = scan->currentItemData;
+ scan->currentItemData = tmp;
}
- so->btso_mrkbuf = buf;
- tmp = *current;
- *current = scan->currentItemData;
- scan->currentItemData = tmp;
- }
}
/*
- * _bt_scantouched() -- check to see if a scan is affected by a given
- * change to the index
+ * _bt_scantouched() -- check to see if a scan is affected by a given
+ * change to the index
*/
-static bool
+static bool
_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
- ItemPointer current;
-
- current = &(scan->currentItemData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno)
- return (true);
-
- current = &(scan->currentMarkData);
- if (ItemPointerIsValid(current)
- && ItemPointerGetBlockNumber(current) == blkno
- && ItemPointerGetOffsetNumber(current) >= offno)
- return (true);
-
- return (false);
+ ItemPointer current;
+
+ current = &(scan->currentItemData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ current = &(scan->currentMarkData);
+ if (ItemPointerIsValid(current)
+ && ItemPointerGetBlockNumber(current) == blkno
+ && ItemPointerGetOffsetNumber(current) >= offno)
+ return (true);
+
+ return (false);
}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 1d1c8072b93..8b1f75b7533 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* btsearch.c--
- * search code for postgres btrees.
+ * search code for postgres btrees.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.23 1997/08/19 21:29:42 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.24 1997/09/07 04:38:58 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -22,1435 +22,1516 @@
#include <catalog/pg_proc.h>
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
-static BTStack
-_bt_searchr(Relation rel, int keysz, ScanKey scankey,
- Buffer *bufP, BTStack stack_in);
-static OffsetNumber
-_bt_firsteq(Relation rel, TupleDesc itupdesc, Page page,
- Size keysz, ScanKey scankey, OffsetNumber offnum);
-static int
-_bt_compare(Relation rel, TupleDesc itupdesc, Page page,
- int keysz, ScanKey scankey, OffsetNumber offnum);
-static bool
-_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir);
-static RetrieveIndexResult
-_bt_endpoint(IndexScanDesc scan, ScanDirection dir);
+static BTStack
+_bt_searchr(Relation rel, int keysz, ScanKey scankey,
+ Buffer * bufP, BTStack stack_in);
+static OffsetNumber
+_bt_firsteq(Relation rel, TupleDesc itupdesc, Page page,
+ Size keysz, ScanKey scankey, OffsetNumber offnum);
+static int
+_bt_compare(Relation rel, TupleDesc itupdesc, Page page,
+ int keysz, ScanKey scankey, OffsetNumber offnum);
+static bool
+ _bt_twostep(IndexScanDesc scan, Buffer * bufP, ScanDirection dir);
+static RetrieveIndexResult
+ _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
- * _bt_search() -- Search for a scan key in the index.
+ * _bt_search() -- Search for a scan key in the index.
*
- * This routine is actually just a helper that sets things up and
- * calls a recursive-descent search routine on the tree.
+ * This routine is actually just a helper that sets things up and
+ * calls a recursive-descent search routine on the tree.
*/
BTStack
-_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP)
+_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer * bufP)
{
- *bufP = _bt_getroot(rel, BT_READ);
- return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL));
+ *bufP = _bt_getroot(rel, BT_READ);
+ return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL));
}
/*
- * _bt_searchr() -- Search the tree recursively for a particular scankey.
+ * _bt_searchr() -- Search the tree recursively for a particular scankey.
*/
-static BTStack
+static BTStack
_bt_searchr(Relation rel,
- int keysz,
- ScanKey scankey,
- Buffer *bufP,
- BTStack stack_in)
+ int keysz,
+ ScanKey scankey,
+ Buffer * bufP,
+ BTStack stack_in)
{
- BTStack stack;
- OffsetNumber offnum;
- Page page;
- BTPageOpaque opaque;
- BlockNumber par_blkno;
- BlockNumber blkno;
- ItemId itemid;
- BTItem btitem;
- BTItem item_save;
- int item_nbytes;
- IndexTuple itup;
-
- /* if this is a leaf page, we're done */
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (opaque->btpo_flags & BTP_LEAF)
- return (stack_in);
-
- /*
- * Find the appropriate item on the internal page, and get the child
- * page that it points to.
- */
-
- par_blkno = BufferGetBlockNumber(*bufP);
- offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
- itemid = PageGetItemId(page, offnum);
- btitem = (BTItem) PageGetItem(page, itemid);
- itup = &(btitem->bti_itup);
- blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
-
- /*
- * We need to save the bit image of the index entry we chose in the
- * parent page on a stack. In case we split the tree, we'll use this
- * bit image to figure out what our real parent page is, in case the
- * parent splits while we're working lower in the tree. See the paper
- * by Lehman and Yao for how this is detected and handled. (We use
- * unique OIDs to disambiguate duplicate keys in the index -- Lehman
- * and Yao disallow duplicate keys).
- */
-
- item_nbytes = ItemIdGetLength(itemid);
- item_save = (BTItem) palloc(item_nbytes);
- memmove((char *) item_save, (char *) btitem, item_nbytes);
- stack = (BTStack) palloc(sizeof(BTStackData));
- stack->bts_blkno = par_blkno;
- stack->bts_offset = offnum;
- stack->bts_btitem = item_save;
- stack->bts_parent = stack_in;
-
- /* drop the read lock on the parent page and acquire one on the child */
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
-
- /*
- * Race -- the page we just grabbed may have split since we read its
- * pointer in the parent. If it has, we may need to move right to its
- * new sibling. Do that.
- */
-
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
-
- /* okay, all set to move down a level */
- return (_bt_searchr(rel, keysz, scankey, bufP, stack));
+ BTStack stack;
+ OffsetNumber offnum;
+ Page page;
+ BTPageOpaque opaque;
+ BlockNumber par_blkno;
+ BlockNumber blkno;
+ ItemId itemid;
+ BTItem btitem;
+ BTItem item_save;
+ int item_nbytes;
+ IndexTuple itup;
+
+ /* if this is a leaf page, we're done */
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (opaque->btpo_flags & BTP_LEAF)
+ return (stack_in);
+
+ /*
+ * Find the appropriate item on the internal page, and get the child
+ * page that it points to.
+ */
+
+ par_blkno = BufferGetBlockNumber(*bufP);
+ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT);
+ itemid = PageGetItemId(page, offnum);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ itup = &(btitem->bti_itup);
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
+ /*
+ * We need to save the bit image of the index entry we chose in the
+ * parent page on a stack. In case we split the tree, we'll use this
+ * bit image to figure out what our real parent page is, in case the
+ * parent splits while we're working lower in the tree. See the paper
+ * by Lehman and Yao for how this is detected and handled. (We use
+ * unique OIDs to disambiguate duplicate keys in the index -- Lehman
+ * and Yao disallow duplicate keys).
+ */
+
+ item_nbytes = ItemIdGetLength(itemid);
+ item_save = (BTItem) palloc(item_nbytes);
+ memmove((char *) item_save, (char *) btitem, item_nbytes);
+ stack = (BTStack) palloc(sizeof(BTStackData));
+ stack->bts_blkno = par_blkno;
+ stack->bts_offset = offnum;
+ stack->bts_btitem = item_save;
+ stack->bts_parent = stack_in;
+
+ /* drop the read lock on the parent page and acquire one on the child */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+
+ /*
+ * Race -- the page we just grabbed may have split since we read its
+ * pointer in the parent. If it has, we may need to move right to its
+ * new sibling. Do that.
+ */
+
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);
+
+ /* okay, all set to move down a level */
+ return (_bt_searchr(rel, keysz, scankey, bufP, stack));
}
/*
- * _bt_moveright() -- move right in the btree if necessary.
+ * _bt_moveright() -- move right in the btree if necessary.
*
- * When we drop and reacquire a pointer to a page, it is possible that
- * the page has changed in the meanwhile. If this happens, we're
- * guaranteed that the page has "split right" -- that is, that any
- * data that appeared on the page originally is either on the page
- * or strictly to the right of it.
+ * When we drop and reacquire a pointer to a page, it is possible that
+ * the page has changed in the meanwhile. If this happens, we're
+ * guaranteed that the page has "split right" -- that is, that any
+ * data that appeared on the page originally is either on the page
+ * or strictly to the right of it.
*
- * This routine decides whether or not we need to move right in the
- * tree by examining the high key entry on the page. If that entry
- * is strictly less than one we expect to be on the page, then our
- * picture of the page is incorrect and we need to move right.
+ * This routine decides whether or not we need to move right in the
+ * tree by examining the high key entry on the page. If that entry
+ * is strictly less than one we expect to be on the page, then our
+ * picture of the page is incorrect and we need to move right.
*
- * On entry, we have the buffer pinned and a lock of the proper type.
- * If we move right, we release the buffer and lock and acquire the
- * same on the right sibling.
+ * On entry, we have the buffer pinned and a lock of the proper type.
+ * If we move right, we release the buffer and lock and acquire the
+ * same on the right sibling.
*/
Buffer
_bt_moveright(Relation rel,
- Buffer buf,
- int keysz,
- ScanKey scankey,
- int access)
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int access)
{
- Page page;
- BTPageOpaque opaque;
- ItemId hikey;
- BlockNumber rblkno;
- int natts = rel->rd_rel->relnatts;
-
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* if we're on a rightmost page, we don't need to move right */
- if (P_RIGHTMOST(opaque))
- return (buf);
-
- /* by convention, item 0 on non-rightmost pages is the high key */
- hikey = PageGetItemId(page, P_HIKEY);
-
- /*
- * If the scan key that brought us to this page is >= the high key
- * stored on the page, then the page has split and we need to move
- * right.
- */
-
- if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
- BTGreaterEqualStrategyNumber))
- {
- /* move right as long as we need to */
- do
+ Page page;
+ BTPageOpaque opaque;
+ ItemId hikey;
+ BlockNumber rblkno;
+ int natts = rel->rd_rel->relnatts;
+
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* if we're on a rightmost page, we don't need to move right */
+ if (P_RIGHTMOST(opaque))
+ return (buf);
+
+ /* by convention, item 0 on non-rightmost pages is the high key */
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ /*
+ * If the scan key that brought us to this page is >= the high key
+ * stored on the page, then the page has split and we need to move
+ * right.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber))
{
- OffsetNumber offmax = PageGetMaxOffsetNumber(page);
- /*
- * If this page consists of all duplicate keys (hikey and first
- * key on the page have the same value), then we don't need to
- * step right.
- *
- * NOTE for multi-column indices: we may do scan using
- * keys not for all attrs. But we handle duplicates
- * using all attrs in _bt_insert/_bt_spool code.
- * And so we've to compare scankey with _last_ item
- * on this page to do not lose "good" tuples if number
- * of attrs > keysize. Example: (2,0) - last items on
- * this page, (2,1) - first item on next page (hikey),
- * our scankey is x = 2. Scankey == (2,1) because of
- * we compare first attrs only, but we shouldn't to move
- * right of here. - vadim 04/15/97
- */
-
- if ( _bt_skeycmp (rel, keysz, scankey, page, hikey,
- BTEqualStrategyNumber) )
- {
- if ( opaque->btpo_flags & BTP_CHAIN )
- {
- Assert ( ( opaque->btpo_flags & BTP_LEAF ) || offmax > P_HIKEY );
- break;
- }
- if ( offmax > P_HIKEY )
- {
- if ( natts == keysz ) /* sanity checks */
- {
- if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId (page, P_FIRSTKEY),
- BTEqualStrategyNumber) )
- elog (FATAL, "btree: BTP_CHAIN flag was expected");
- if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId (page, offmax),
- BTEqualStrategyNumber) )
- elog (FATAL, "btree: unexpected equal last item");
- if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId (page, offmax),
- BTLessStrategyNumber) )
- elog (FATAL, "btree: unexpected greater last item");
- /* move right */
- }
- else if ( _bt_skeycmp (rel, keysz, scankey, page,
- PageGetItemId (page, offmax),
- BTLessEqualStrategyNumber) )
- break;
- }
- }
-
- /* step right one page */
- rblkno = opaque->btpo_next;
- _bt_relbuf(rel, buf, access);
- buf = _bt_getbuf(rel, rblkno, access);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- hikey = PageGetItemId(page, P_HIKEY);
-
- } while (! P_RIGHTMOST(opaque)
- && _bt_skeycmp(rel, keysz, scankey, page, hikey,
- BTGreaterEqualStrategyNumber));
- }
- return (buf);
+ /* move right as long as we need to */
+ do
+ {
+ OffsetNumber offmax = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If this page consists of all duplicate keys (hikey and
+ * first key on the page have the same value), then we don't
+ * need to step right.
+ *
+ * NOTE for multi-column indices: we may do scan using keys not
+ * for all attrs. But we handle duplicates using all attrs in
+ * _bt_insert/_bt_spool code. And so we've to compare scankey
+ * with _last_ item on this page to do not lose "good" tuples
+ * if number of attrs > keysize. Example: (2,0) - last items
+ * on this page, (2,1) - first item on next page (hikey), our
+ * scankey is x = 2. Scankey == (2,1) because of we compare
+ * first attrs only, but we shouldn't to move right of here.
+ * - vadim 04/15/97
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTEqualStrategyNumber))
+ {
+ if (opaque->btpo_flags & BTP_CHAIN)
+ {
+ Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY);
+ break;
+ }
+ if (offmax > P_HIKEY)
+ {
+ if (natts == keysz) /* sanity checks */
+ {
+ if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, P_FIRSTKEY),
+ BTEqualStrategyNumber))
+ elog(FATAL, "btree: BTP_CHAIN flag was expected");
+ if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, offmax),
+ BTEqualStrategyNumber))
+ elog(FATAL, "btree: unexpected equal last item");
+ if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, offmax),
+ BTLessStrategyNumber))
+ elog(FATAL, "btree: unexpected greater last item");
+ /* move right */
+ }
+ else if (_bt_skeycmp(rel, keysz, scankey, page,
+ PageGetItemId(page, offmax),
+ BTLessEqualStrategyNumber))
+ break;
+ }
+ }
+
+ /* step right one page */
+ rblkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, access);
+ buf = _bt_getbuf(rel, rblkno, access);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ hikey = PageGetItemId(page, P_HIKEY);
+
+ } while (!P_RIGHTMOST(opaque)
+ && _bt_skeycmp(rel, keysz, scankey, page, hikey,
+ BTGreaterEqualStrategyNumber));
+ }
+ return (buf);
}
/*
- * _bt_skeycmp() -- compare a scan key to a particular item on a page using
- * a requested strategy (<, <=, =, >=, >).
+ * _bt_skeycmp() -- compare a scan key to a particular item on a page using
+ * a requested strategy (<, <=, =, >=, >).
*
- * We ignore the unique OIDs stored in the btree item here. Those
- * numbers are intended for use internally only, in repositioning a
- * scan after a page split. They do not impose any meaningful ordering.
+ * We ignore the unique OIDs stored in the btree item here. Those
+ * numbers are intended for use internally only, in repositioning a
+ * scan after a page split. They do not impose any meaningful ordering.
*
- * The comparison is A <op> B, where A is the scan key and B is the
- * tuple pointed at by itemid on page.
+ * The comparison is A <op> B, where A is the scan key and B is the
+ * tuple pointed at by itemid on page.
*/
bool
_bt_skeycmp(Relation rel,
- Size keysz,
- ScanKey scankey,
- Page page,
- ItemId itemid,
- StrategyNumber strat)
+ Size keysz,
+ ScanKey scankey,
+ Page page,
+ ItemId itemid,
+ StrategyNumber strat)
{
- BTItem item;
- IndexTuple indexTuple;
- TupleDesc tupDes;
- ScanKey entry;
- int i;
- Datum attrDatum;
- Datum keyDatum;
- bool compare;
- bool isNull;
- bool useEqual = false;
- bool keyNull;
-
- if ( strat == BTLessEqualStrategyNumber )
- {
- useEqual = true;
- strat = BTLessStrategyNumber;
- }
- else if ( strat == BTGreaterEqualStrategyNumber )
- {
- useEqual = true;
- strat = BTGreaterStrategyNumber;
- }
-
- item = (BTItem) PageGetItem(page, itemid);
- indexTuple = &(item->bti_itup);
-
- tupDes = RelationGetTupleDescriptor(rel);
-
- /* see if the comparison is true for all of the key attributes */
- for (i=1; i <= keysz; i++) {
-
- entry = &scankey[i-1];
- Assert ( entry->sk_attno == i );
- attrDatum = index_getattr(indexTuple,
- entry->sk_attno,
- tupDes,
- &isNull);
- keyDatum = entry->sk_argument;
-
- /* see comments about NULLs handling in btbuild */
- if ( entry->sk_flags & SK_ISNULL ) /* key is NULL */
+ BTItem item;
+ IndexTuple indexTuple;
+ TupleDesc tupDes;
+ ScanKey entry;
+ int i;
+ Datum attrDatum;
+ Datum keyDatum;
+ bool compare;
+ bool isNull;
+ bool useEqual = false;
+ bool keyNull;
+
+ if (strat == BTLessEqualStrategyNumber)
{
- Assert ( entry->sk_procedure == NullValueRegProcedure );
- keyNull = true;
- if ( isNull )
- compare = ( strat == BTEqualStrategyNumber ) ? true : false;
- else
- compare = ( strat == BTGreaterStrategyNumber ) ? true : false;
- }
- else if ( isNull ) /* key is NOT_NULL and item is NULL */
- {
- keyNull = false;
- compare = ( strat == BTLessStrategyNumber ) ? true : false;
- }
- else
- {
- keyNull = false;
- compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+ useEqual = true;
+ strat = BTLessStrategyNumber;
}
-
- if ( compare ) /* true for one of ">, <, =" */
+ else if (strat == BTGreaterEqualStrategyNumber)
{
- if ( strat != BTEqualStrategyNumber )
- return (true);
+ useEqual = true;
+ strat = BTGreaterStrategyNumber;
}
- else /* false for one of ">, <, =" */
+
+ item = (BTItem) PageGetItem(page, itemid);
+ indexTuple = &(item->bti_itup);
+
+ tupDes = RelationGetTupleDescriptor(rel);
+
+ /* see if the comparison is true for all of the key attributes */
+ for (i = 1; i <= keysz; i++)
{
- if ( strat == BTEqualStrategyNumber )
- return (false);
- /*
- * if original strat was "<=, >=" OR
- * "<, >" but some attribute(s) left
- * - need to test for Equality
- */
- if ( useEqual || i < keysz )
- {
- if ( keyNull || isNull )
- compare = ( keyNull && isNull ) ? true : false;
- else
- compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
- keyDatum, attrDatum);
- if ( compare ) /* key' and item' attributes are equal */
- continue; /* - try to compare next attributes */
- }
- return (false);
+
+ entry = &scankey[i - 1];
+ Assert(entry->sk_attno == i);
+ attrDatum = index_getattr(indexTuple,
+ entry->sk_attno,
+ tupDes,
+ &isNull);
+ keyDatum = entry->sk_argument;
+
+ /* see comments about NULLs handling in btbuild */
+ if (entry->sk_flags & SK_ISNULL) /* key is NULL */
+ {
+ Assert(entry->sk_procedure == NullValueRegProcedure);
+ keyNull = true;
+ if (isNull)
+ compare = (strat == BTEqualStrategyNumber) ? true : false;
+ else
+ compare = (strat == BTGreaterStrategyNumber) ? true : false;
+ }
+ else if (isNull) /* key is NOT_NULL and item is NULL */
+ {
+ keyNull = false;
+ compare = (strat == BTLessStrategyNumber) ? true : false;
+ }
+ else
+ {
+ keyNull = false;
+ compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum);
+ }
+
+ if (compare) /* true for one of ">, <, =" */
+ {
+ if (strat != BTEqualStrategyNumber)
+ return (true);
+ }
+ else
+/* false for one of ">, <, =" */
+ {
+ if (strat == BTEqualStrategyNumber)
+ return (false);
+
+ /*
+ * if original strat was "<=, >=" OR "<, >" but some
+ * attribute(s) left - need to test for Equality
+ */
+ if (useEqual || i < keysz)
+ {
+ if (keyNull || isNull)
+ compare = (keyNull && isNull) ? true : false;
+ else
+ compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber,
+ keyDatum, attrDatum);
+ if (compare) /* key' and item' attributes are equal */
+ continue; /* - try to compare next attributes */
+ }
+ return (false);
+ }
}
- }
-
- return (true);
+
+ return (true);
}
/*
- * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
*
- * The scankey we get has the compare function stored in the procedure
- * entry of each data struct. We invoke this regproc to do the
- * comparison for every key in the scankey. _bt_binsrch() returns
- * the OffsetNumber of the first matching key on the page, or the
- * OffsetNumber at which the matching key would appear if it were
- * on this page.
+ * The scankey we get has the compare function stored in the procedure
+ * entry of each data struct. We invoke this regproc to do the
+ * comparison for every key in the scankey. _bt_binsrch() returns
+ * the OffsetNumber of the first matching key on the page, or the
+ * OffsetNumber at which the matching key would appear if it were
+ * on this page.
*
- * By the time this procedure is called, we're sure we're looking
- * at the right page -- don't need to walk right. _bt_binsrch() has
- * no lock or refcount side effects on the buffer.
+ * By the time this procedure is called, we're sure we're looking
+ * at the right page -- don't need to walk right. _bt_binsrch() has
+ * no lock or refcount side effects on the buffer.
*/
OffsetNumber
_bt_binsrch(Relation rel,
- Buffer buf,
- int keysz,
- ScanKey scankey,
- int srchtype)
+ Buffer buf,
+ int keysz,
+ ScanKey scankey,
+ int srchtype)
{
- TupleDesc itupdesc;
- Page page;
- BTPageOpaque opaque;
- OffsetNumber low, mid, high;
- int natts = rel->rd_rel->relnatts;
- int result;
-
- itupdesc = RelationGetTupleDescriptor(rel);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* by convention, item 1 on any non-rightmost page is the high key */
- low = mid = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- high = PageGetMaxOffsetNumber(page);
-
- /*
- * Since for non-rightmost pages, the first item on the page is the
- * high key, there are two notions of emptiness. One is if nothing
- * appears on the page. The other is if nothing but the high key does.
- * The reason we test high <= low, rather than high == low, is that
- * after vacuuming there may be nothing *but* the high key on a page.
- * In that case, given the scheme above, low = 2 and high = 1.
- */
-
- if ( PageIsEmpty (page) )
- return (low);
- if ( (! P_RIGHTMOST(opaque) && high <= low))
- {
- if ( high < low ||
- (srchtype == BT_DESCENT && !(opaque->btpo_flags & BTP_LEAF)) )
- return (low);
- /* It's insertion and high == low == 2 */
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
- if ( result > 0 )
- return ( OffsetNumberNext (low) );
- return (low);
- }
-
- while ((high - low) > 1) {
- mid = low + ((high - low) / 2);
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
-
- if (result > 0)
- low = mid;
- else if (result < 0)
- high = mid - 1;
- else
+ TupleDesc itupdesc;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low,
+ mid,
+ high;
+ int natts = rel->rd_rel->relnatts;
+ int result;
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* by convention, item 1 on any non-rightmost page is the high key */
+ low = mid = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ high = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Since for non-rightmost pages, the first item on the page is the
+ * high key, there are two notions of emptiness. One is if nothing
+ * appears on the page. The other is if nothing but the high key
+ * does. The reason we test high <= low, rather than high == low, is
+ * that after vacuuming there may be nothing *but* the high key on a
+ * page. In that case, given the scheme above, low = 2 and high = 1.
+ */
+
+ if (PageIsEmpty(page))
+ return (low);
+ if ((!P_RIGHTMOST(opaque) && high <= low))
{
- mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, mid);
- /*
- * NOTE for multi-column indices: we may do scan using
- * keys not for all attrs. But we handle duplicates using
- * all attrs in _bt_insert/_bt_spool code. And so while
- * searching on internal pages having number of attrs > keysize
- * we want to point at the last item < the scankey, not at the
- * first item = the scankey (!!!), and let _bt_moveright
- * decide later whether to move right or not (see comments and
- * example there). Note also that INSERTions are not affected
- * by this code (natts == keysz). - vadim 04/15/97
- */
- if ( natts == keysz || opaque->btpo_flags & BTP_LEAF )
- return (mid);
- low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- if ( mid == low )
- return (mid);
- return (OffsetNumberPrev(mid));
+ if (high < low ||
+ (srchtype == BT_DESCENT && !(opaque->btpo_flags & BTP_LEAF)))
+ return (low);
+ /* It's insertion and high == low == 2 */
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
+ if (result > 0)
+ return (OffsetNumberNext(low));
+ return (low);
}
- }
-
- /*
- * We terminated because the endpoints got too close together. There
- * are two cases to take care of.
- *
- * For non-insertion searches on internal pages, we want to point at
- * the last key <, or first key =, the scankey on the page. This
- * guarantees that we'll descend the tree correctly.
- * (NOTE comments above for multi-column indices).
- *
- * For all other cases, we want to point at the first key >=
- * the scankey on the page. This guarantees that scans and
- * insertions will happen correctly.
- */
-
- if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT)
- { /*
- * We want the last key <, or first key ==, the scan key.
- */
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
-
- if (result == 0)
+
+ while ((high - low) > 1)
{
- mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, high);
- /*
- * If natts > keysz we want last item < the scan key.
- * See comments above for multi-column indices.
- */
- if ( natts == keysz )
- return (mid);
- low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- if ( mid == low )
- return (mid);
- return (OffsetNumberPrev(mid));
+ mid = low + ((high - low) / 2);
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid);
+
+ if (result > 0)
+ low = mid;
+ else if (result < 0)
+ high = mid - 1;
+ else
+ {
+ mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, mid);
+
+ /*
+ * NOTE for multi-column indices: we may do scan using keys
+ * not for all attrs. But we handle duplicates using all attrs
+ * in _bt_insert/_bt_spool code. And so while searching on
+ * internal pages having number of attrs > keysize we want to
+ * point at the last item < the scankey, not at the first item
+ * = the scankey (!!!), and let _bt_moveright decide later
+ * whether to move right or not (see comments and example
+ * there). Note also that INSERTions are not affected by this
+ * code (natts == keysz). - vadim 04/15/97
+ */
+ if (natts == keysz || opaque->btpo_flags & BTP_LEAF)
+ return (mid);
+ low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ if (mid == low)
+ return (mid);
+ return (OffsetNumberPrev(mid));
+ }
+ }
+
+ /*
+ * We terminated because the endpoints got too close together. There
+ * are two cases to take care of.
+ *
+ * For non-insertion searches on internal pages, we want to point at the
+ * last key <, or first key =, the scankey on the page. This
+ * guarantees that we'll descend the tree correctly. (NOTE comments
+ * above for multi-column indices).
+ *
+ * For all other cases, we want to point at the first key >= the scankey
+ * on the page. This guarantees that scans and insertions will happen
+ * correctly.
+ */
+
+ if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT)
+ { /* We want the last key <, or first key
+ * ==, the scan key. */
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+
+ if (result == 0)
+ {
+ mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, high);
+
+ /*
+ * If natts > keysz we want last item < the scan key. See
+ * comments above for multi-column indices.
+ */
+ if (natts == keysz)
+ return (mid);
+ low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ if (mid == low)
+ return (mid);
+ return (OffsetNumberPrev(mid));
+ }
+ else if (result > 0)
+ return (high);
+ else
+ return (low);
}
- else if (result > 0)
- return (high);
- else
- return (low);
- }
- else /* we want the first key >= the scan key */
- {
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
- if (result <= 0)
- return (low);
else
+/* we want the first key >= the scan key */
{
- if (low == high)
- return (OffsetNumberNext(low));
-
- result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
- if (result <= 0)
- return (high);
- else
- return (OffsetNumberNext(high));
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, low);
+ if (result <= 0)
+ return (low);
+ else
+ {
+ if (low == high)
+ return (OffsetNumberNext(low));
+
+ result = _bt_compare(rel, itupdesc, page, keysz, scankey, high);
+ if (result <= 0)
+ return (high);
+ else
+ return (OffsetNumberNext(high));
+ }
}
- }
}
-static OffsetNumber
+static OffsetNumber
_bt_firsteq(Relation rel,
- TupleDesc itupdesc,
- Page page,
- Size keysz,
- ScanKey scankey,
- OffsetNumber offnum)
+ TupleDesc itupdesc,
+ Page page,
+ Size keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
{
- BTPageOpaque opaque;
- OffsetNumber limit;
-
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /* skip the high key, if any */
- limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- /* walk backwards looking for the first key in the chain of duplicates */
- while (offnum > limit
- && _bt_compare(rel, itupdesc, page,
- keysz, scankey, OffsetNumberPrev(offnum)) == 0) {
- offnum = OffsetNumberPrev(offnum);
- }
-
- return (offnum);
+ BTPageOpaque opaque;
+ OffsetNumber limit;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* skip the high key, if any */
+ limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* walk backwards looking for the first key in the chain of duplicates */
+ while (offnum > limit
+ && _bt_compare(rel, itupdesc, page,
+ keysz, scankey, OffsetNumberPrev(offnum)) == 0)
+ {
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ return (offnum);
}
/*
- * _bt_compare() -- Compare scankey to a particular tuple on the page.
+ * _bt_compare() -- Compare scankey to a particular tuple on the page.
*
- * This routine returns:
- * -1 if scankey < tuple at offnum;
- * 0 if scankey == tuple at offnum;
- * +1 if scankey > tuple at offnum.
+ * This routine returns:
+ * -1 if scankey < tuple at offnum;
+ * 0 if scankey == tuple at offnum;
+ * +1 if scankey > tuple at offnum.
*
- * -- Old comments:
- * In order to avoid having to propagate changes up the tree any time
- * a new minimal key is inserted, the leftmost entry on the leftmost
- * page is less than all possible keys, by definition.
+ * -- Old comments:
+ * In order to avoid having to propagate changes up the tree any time
+ * a new minimal key is inserted, the leftmost entry on the leftmost
+ * page is less than all possible keys, by definition.
*
- * -- New ones:
- * New insertion code (fix against updating _in_place_ if new minimal
- * key has bigger size than old one) may delete P_HIKEY entry on the
- * root page in order to insert new minimal key - and so this definition
- * does not work properly in this case and breaks key' order on root
- * page. BTW, this propagation occures only while page' splitting,
- * but not "any time a new min key is inserted" (see _bt_insertonpg).
- * - vadim 12/05/96
+ * -- New ones:
+ * New insertion code (fix against updating _in_place_ if new minimal
+ * key has bigger size than old one) may delete P_HIKEY entry on the
+ * root page in order to insert new minimal key - and so this definition
+ * does not work properly in this case and breaks key' order on root
+ * page. BTW, this propagation occures only while page' splitting,
+ * but not "any time a new min key is inserted" (see _bt_insertonpg).
+ * - vadim 12/05/96
*/
static int
_bt_compare(Relation rel,
- TupleDesc itupdesc,
- Page page,
- int keysz,
- ScanKey scankey,
- OffsetNumber offnum)
+ TupleDesc itupdesc,
+ Page page,
+ int keysz,
+ ScanKey scankey,
+ OffsetNumber offnum)
{
- Datum datum;
- BTItem btitem;
- ItemId itemid;
- IndexTuple itup;
- BTPageOpaque opaque;
- ScanKey entry;
- AttrNumber attno;
- int result;
- int i;
- bool null;
-
- /*
- * If this is a leftmost internal page, and if our comparison is
- * with the first key on the page, then the item at that position is
- * by definition less than the scan key.
- *
- * - see new comments above...
- */
-
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (!(opaque->btpo_flags & BTP_LEAF)
- && P_LEFTMOST(opaque)
- && offnum == P_HIKEY) {
- itemid = PageGetItemId(page, offnum);
-
+ Datum datum;
+ BTItem btitem;
+ ItemId itemid;
+ IndexTuple itup;
+ BTPageOpaque opaque;
+ ScanKey entry;
+ AttrNumber attno;
+ int result;
+ int i;
+ bool null;
+
/*
- * we just have to believe that this will only be called with
- * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the
- * first actual data key (i.e., this is also a rightmost
- * page). there doesn't seem to be any code that implies
- * that the leftmost page is normally missing a high key as
- * well as the rightmost page. but that implies that this
- * code path only applies to the root -- which seems
- * unlikely..
+ * If this is a leftmost internal page, and if our comparison is with
+ * the first key on the page, then the item at that position is by
+ * definition less than the scan key.
*
- * - see new comments above...
+ * - see new comments above...
*/
- if (! P_RIGHTMOST(opaque)) {
- elog(WARN, "_bt_compare: invalid comparison to high key");
- }
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (!(opaque->btpo_flags & BTP_LEAF)
+ && P_LEFTMOST(opaque)
+ && offnum == P_HIKEY)
+ {
+ itemid = PageGetItemId(page, offnum);
+
+ /*
+ * we just have to believe that this will only be called with
+ * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first
+ * actual data key (i.e., this is also a rightmost page). there
+ * doesn't seem to be any code that implies that the leftmost page
+ * is normally missing a high key as well as the rightmost page.
+ * but that implies that this code path only applies to the root
+ * -- which seems unlikely..
+ *
+ * - see new comments above...
+ */
+ if (!P_RIGHTMOST(opaque))
+ {
+ elog(WARN, "_bt_compare: invalid comparison to high key");
+ }
#if 0
+
+ /*
+ * We just have to belive that right answer will not break
+ * anything. I've checked code and all seems to be ok. See new
+ * comments above...
+ *
+ * -- Old comments If the item on the page is equal to the scankey,
+ * that's okay to admit. We just can't claim that the first key
+ * on the page is greater than anything.
+ */
+
+ if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
+ BTEqualStrategyNumber))
+ {
+ return (0);
+ }
+ return (1);
+#endif
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
/*
- * We just have to belive that right answer will not
- * break anything. I've checked code and all seems to be ok.
- * See new comments above...
+ * The scan key is set up with the attribute number associated with
+ * each term in the key. It is important that, if the index is
+ * multi-key, the scan contain the first k key attributes, and that
+ * they be in order. If you think about how multi-key ordering works,
+ * you'll understand why this is.
*
- * -- Old comments
- * If the item on the page is equal to the scankey, that's
- * okay to admit. We just can't claim that the first key on
- * the page is greater than anything.
+ * We don't test for violation of this condition here.
*/
-
- if (_bt_skeycmp(rel, keysz, scankey, page, itemid,
- BTEqualStrategyNumber)) {
- return (0);
- }
- return (1);
-#endif
- }
-
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
-
- /*
- * The scan key is set up with the attribute number associated with each
- * term in the key. It is important that, if the index is multi-key,
- * the scan contain the first k key attributes, and that they be in
- * order. If you think about how multi-key ordering works, you'll
- * understand why this is.
- *
- * We don't test for violation of this condition here.
- */
-
- for (i = 1; i <= keysz; i++) {
- long tmpres;
-
- entry = &scankey[i - 1];
- attno = entry->sk_attno;
- datum = index_getattr(itup, attno, itupdesc, &null);
-
- /* see comments about NULLs handling in btbuild */
- if ( entry->sk_flags & SK_ISNULL ) /* key is NULL */
+
+ for (i = 1; i <= keysz; i++)
{
- Assert ( entry->sk_procedure == NullValueRegProcedure );
- if ( null )
- tmpres = (long) 0; /* NULL "=" NULL */
- else
- tmpres = (long) 1; /* NULL ">" NOT_NULL */
- }
- else if ( null ) /* key is NOT_NULL and item is NULL */
- {
- tmpres = (long) -1; /* NOT_NULL "<" NULL */
- }
- else
- {
- tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
- entry->sk_argument, datum);
+ long tmpres;
+
+ entry = &scankey[i - 1];
+ attno = entry->sk_attno;
+ datum = index_getattr(itup, attno, itupdesc, &null);
+
+ /* see comments about NULLs handling in btbuild */
+ if (entry->sk_flags & SK_ISNULL) /* key is NULL */
+ {
+ Assert(entry->sk_procedure == NullValueRegProcedure);
+ if (null)
+ tmpres = (long) 0; /* NULL "=" NULL */
+ else
+ tmpres = (long) 1; /* NULL ">" NOT_NULL */
+ }
+ else if (null) /* key is NOT_NULL and item is NULL */
+ {
+ tmpres = (long) -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure,
+ entry->sk_argument, datum);
+ }
+ result = tmpres;
+
+ /* if the keys are unequal, return the difference */
+ if (result != 0)
+ return (result);
}
- result = tmpres;
-
- /* if the keys are unequal, return the difference */
- if (result != 0)
- return (result);
- }
-
- /* by here, the keys are equal */
- return (0);
+
+ /* by here, the keys are equal */
+ return (0);
}
/*
- * _bt_next() -- Get the next item in a scan.
+ * _bt_next() -- Get the next item in a scan.
*
- * On entry, we have a valid currentItemData in the scan, and a
- * read lock on the page that contains that item. We do not have
- * the page pinned. We return the next item in the scan. On
- * exit, we have the page containing the next item locked but not
- * pinned.
+ * On entry, we have a valid currentItemData in the scan, and a
+ * read lock on the page that contains that item. We do not have
+ * the page pinned. We return the next item in the scan. On
+ * exit, we have the page containing the next item locked but not
+ * pinned.
*/
RetrieveIndexResult
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
- Buffer buf;
- Page page;
- OffsetNumber offnum;
- RetrieveIndexResult res;
- ItemPointer current;
- BTItem btitem;
- IndexTuple itup;
- BTScanOpaque so;
- Size keysok;
-
- rel = scan->relation;
- so = (BTScanOpaque) scan->opaque;
- current = &(scan->currentItemData);
-
- /*
- * XXX 10 may 91: somewhere there's a bug in our management of the
- * cached buffer for this scan. wei discovered it. the following
- * is a workaround so he can work until i figure out what's going on.
- */
-
- if (!BufferIsValid(so->btso_curbuf))
- so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current),
- BT_READ);
-
- /* we still have the buffer pinned and locked */
- buf = so->btso_curbuf;
-
- do
- {
- /* step one tuple in the appropriate direction */
- if (!_bt_step(scan, &buf, dir))
- return ((RetrieveIndexResult) NULL);
-
- /* by here, current is the tuple we want to return */
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &btitem->bti_itup;
-
- if ( _bt_checkkeys (scan, itup, &keysok) )
- {
- Assert (keysok == so->numberOfKeys);
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
-
- /* remember which buffer we have pinned and locked */
- so->btso_curbuf = buf;
- return (res);
- }
+ Relation rel;
+ Buffer buf;
+ Page page;
+ OffsetNumber offnum;
+ RetrieveIndexResult res;
+ ItemPointer current;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+ Size keysok;
+
+ rel = scan->relation;
+ so = (BTScanOpaque) scan->opaque;
+ current = &(scan->currentItemData);
+
+ /*
+ * XXX 10 may 91: somewhere there's a bug in our management of the
+ * cached buffer for this scan. wei discovered it. the following is
+ * a workaround so he can work until i figure out what's going on.
+ */
+
+ if (!BufferIsValid(so->btso_curbuf))
+ so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current),
+ BT_READ);
+
+ /* we still have the buffer pinned and locked */
+ buf = so->btso_curbuf;
+
+ do
+ {
+ /* step one tuple in the appropriate direction */
+ if (!_bt_step(scan, &buf, dir))
+ return ((RetrieveIndexResult) NULL);
- } while ( keysok >= so->numberOfFirstKeys );
+ /* by here, current is the tuple we want to return */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkkeys(scan, itup, &keysok))
+ {
+ Assert(keysok == so->numberOfKeys);
+ res = FormRetrieveIndexResult(current, &(itup->t_tid));
+
+ /* remember which buffer we have pinned and locked */
+ so->btso_curbuf = buf;
+ return (res);
+ }
+
+ } while (keysok >= so->numberOfFirstKeys);
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
-
- return ((RetrieveIndexResult) NULL);
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+
+ return ((RetrieveIndexResult) NULL);
}
/*
- * _bt_first() -- Find the first item in a scan.
+ * _bt_first() -- Find the first item in a scan.
*
- * We need to be clever about the type of scan, the operation it's
- * performing, and the tree ordering. We return the RetrieveIndexResult
- * of the first item in the tree that satisfies the qualification
- * associated with the scan descriptor. On exit, the page containing
- * the current index tuple is read locked and pinned, and the scan's
- * opaque data entry is updated to include the buffer.
+ * We need to be clever about the type of scan, the operation it's
+ * performing, and the tree ordering. We return the RetrieveIndexResult
+ * of the first item in the tree that satisfies the qualification
+ * associated with the scan descriptor. On exit, the page containing
+ * the current index tuple is read locked and pinned, and the scan's
+ * opaque data entry is updated to include the buffer.
*/
RetrieveIndexResult
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
- TupleDesc itupdesc;
- Buffer buf;
- Page page;
- BTPageOpaque pop;
- BTStack stack;
- OffsetNumber offnum, maxoff;
- bool offGmax = false;
- BTItem btitem;
- IndexTuple itup;
- ItemPointer current;
- BlockNumber blkno;
- StrategyNumber strat;
- RetrieveIndexResult res;
- RegProcedure proc;
- int result;
- BTScanOpaque so;
- ScanKeyData skdata;
- Size keysok;
-
- rel = scan->relation;
- so = (BTScanOpaque) scan->opaque;
-
- /*
- * Order the keys in the qualification and be sure
- * that the scan exploits the tree order.
- */
- so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */
- so->qual_ok = 1; /* may be changed by _bt_orderkeys */
- scan->scanFromEnd = false;
- if ( so->numberOfKeys > 0 )
- {
- _bt_orderkeys(rel, so);
-
- strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure);
+ Relation rel;
+ TupleDesc itupdesc;
+ Buffer buf;
+ Page page;
+ BTPageOpaque pop;
+ BTStack stack;
+ OffsetNumber offnum,
+ maxoff;
+ bool offGmax = false;
+ BTItem btitem;
+ IndexTuple itup;
+ ItemPointer current;
+ BlockNumber blkno;
+ StrategyNumber strat;
+ RetrieveIndexResult res;
+ RegProcedure proc;
+ int result;
+ BTScanOpaque so;
+ ScanKeyData skdata;
+ Size keysok;
- /* NOTE: it assumes ForwardScanDirection */
- if ( strat == BTLessStrategyNumber ||
- strat == BTLessEqualStrategyNumber )
- scan->scanFromEnd = true;
- }
- else
- scan->scanFromEnd = true;
-
- if ( so->qual_ok == 0 )
- return ((RetrieveIndexResult) NULL);
-
- /* if we just need to walk down one edge of the tree, do that */
- if (scan->scanFromEnd)
- return (_bt_endpoint(scan, dir));
-
- itupdesc = RelationGetTupleDescriptor(rel);
- current = &(scan->currentItemData);
-
- /*
- * Okay, we want something more complicated. What we'll do is use
- * the first item in the scan key passed in (which has been correctly
- * ordered to take advantage of index ordering) to position ourselves
- * at the right place in the scan.
- */
- /* _bt_orderkeys disallows it, but it's place to add some code latter */
- if ( so->keyData[0].sk_flags & SK_ISNULL )
- {
- elog (WARN, "_bt_first: btree doesn't support is(not)null, yet");
- return ((RetrieveIndexResult) NULL);
- }
- proc = index_getprocid(rel, 1, BTORDER_PROC);
- ScanKeyEntryInitialize(&skdata, so->keyData[0].sk_flags, 1, proc,
- so->keyData[0].sk_argument);
-
- stack = _bt_search(rel, 1, &skdata, &buf);
- _bt_freestack(stack);
-
- blkno = BufferGetBlockNumber(buf);
- page = BufferGetPage(buf);
-
- /*
- * This will happen if the tree we're searching is entirely empty,
- * or if we're doing a search for a key that would appear on an
- * entirely empty internal page. In either case, there are no
- * matching tuples in the index.
- */
-
- if (PageIsEmpty(page)) {
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- return ((RetrieveIndexResult) NULL);
- }
- maxoff = PageGetMaxOffsetNumber(page);
- pop = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /*
- * Now _bt_moveright doesn't move from non-rightmost leaf page
- * if scankey == hikey and there is only hikey there. It's
- * good for insertion, but we need to do work for scan here.
- * - vadim 05/27/97
- */
-
- while ( maxoff == P_HIKEY && !P_RIGHTMOST(pop) &&
- _bt_skeycmp(rel, 1, &skdata, page,
- PageGetItemId(page, P_HIKEY),
- BTGreaterEqualStrategyNumber) )
- {
- /* step right one page */
- blkno = pop->btpo_next;
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_READ);
+ rel = scan->relation;
+ so = (BTScanOpaque) scan->opaque;
+
+ /*
+ * Order the keys in the qualification and be sure that the scan
+ * exploits the tree order.
+ */
+ so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */
+ so->qual_ok = 1; /* may be changed by _bt_orderkeys */
+ scan->scanFromEnd = false;
+ if (so->numberOfKeys > 0)
+ {
+ _bt_orderkeys(rel, so);
+
+ strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure);
+
+ /* NOTE: it assumes ForwardScanDirection */
+ if (strat == BTLessStrategyNumber ||
+ strat == BTLessEqualStrategyNumber)
+ scan->scanFromEnd = true;
+ }
+ else
+ scan->scanFromEnd = true;
+
+ if (so->qual_ok == 0)
+ return ((RetrieveIndexResult) NULL);
+
+ /* if we just need to walk down one edge of the tree, do that */
+ if (scan->scanFromEnd)
+ return (_bt_endpoint(scan, dir));
+
+ itupdesc = RelationGetTupleDescriptor(rel);
+ current = &(scan->currentItemData);
+
+ /*
+ * Okay, we want something more complicated. What we'll do is use the
+ * first item in the scan key passed in (which has been correctly
+ * ordered to take advantage of index ordering) to position ourselves
+ * at the right place in the scan.
+ */
+ /* _bt_orderkeys disallows it, but it's place to add some code latter */
+ if (so->keyData[0].sk_flags & SK_ISNULL)
+ {
+ elog(WARN, "_bt_first: btree doesn't support is(not)null, yet");
+ return ((RetrieveIndexResult) NULL);
+ }
+ proc = index_getprocid(rel, 1, BTORDER_PROC);
+ ScanKeyEntryInitialize(&skdata, so->keyData[0].sk_flags, 1, proc,
+ so->keyData[0].sk_argument);
+
+ stack = _bt_search(rel, 1, &skdata, &buf);
+ _bt_freestack(stack);
+
+ blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
- if (PageIsEmpty(page)) {
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- return ((RetrieveIndexResult) NULL);
+
+ /*
+ * This will happen if the tree we're searching is entirely empty, or
+ * if we're doing a search for a key that would appear on an entirely
+ * empty internal page. In either case, there are no matching tuples
+ * in the index.
+ */
+
+ if (PageIsEmpty(page))
+ {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ return ((RetrieveIndexResult) NULL);
}
- maxoff = PageGetMaxOffsetNumber(page);
+ maxoff = PageGetMaxOffsetNumber(page);
pop = (BTPageOpaque) PageGetSpecialPointer(page);
- }
-
-
- /* find the nearest match to the manufactured scan key on the page */
- offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT);
-
- if (offnum > maxoff)
- {
- offnum = maxoff;
- offGmax = true;
- }
-
- ItemPointerSet(current, blkno, offnum);
-
- /*
- * Now find the right place to start the scan. Result is the
- * value we're looking for minus the value we're looking at
- * in the index.
- */
-
- result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
-
- /* it's yet other place to add some code latter for is(not)null */
-
- strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure);
-
- switch (strat) {
- case BTLessStrategyNumber:
- if (result <= 0) {
- do {
- if (!_bt_twostep(scan, &buf, BackwardScanDirection))
- break;
-
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
- } while (result <= 0);
-
- /* if this is true, the key we just looked at is gone */
- if (result > 0)
- _bt_twostep(scan, &buf, ForwardScanDirection);
- }
- break;
-
- case BTLessEqualStrategyNumber:
- if (result >= 0) {
- do {
- if (!_bt_twostep(scan, &buf, ForwardScanDirection))
- break;
-
- offnum = ItemPointerGetOffsetNumber(current);
+
+ /*
+ * Now _bt_moveright doesn't move from non-rightmost leaf page if
+ * scankey == hikey and there is only hikey there. It's good for
+ * insertion, but we need to do work for scan here. - vadim 05/27/97
+ */
+
+ while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) &&
+ _bt_skeycmp(rel, 1, &skdata, page,
+ PageGetItemId(page, P_HIKEY),
+ BTGreaterEqualStrategyNumber))
+ {
+ /* step right one page */
+ blkno = pop->btpo_next;
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
- } while (result >= 0);
-
- if (result < 0)
- _bt_twostep(scan, &buf, BackwardScanDirection);
+ if (PageIsEmpty(page))
+ {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ return ((RetrieveIndexResult) NULL);
+ }
+ maxoff = PageGetMaxOffsetNumber(page);
+ pop = (BTPageOpaque) PageGetSpecialPointer(page);
}
- break;
-
- case BTEqualStrategyNumber:
- if (result != 0) {
- _bt_relbuf(scan->relation, buf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(&(scan->currentItemData));
- return ((RetrieveIndexResult) NULL);
+
+
+ /* find the nearest match to the manufactured scan key on the page */
+ offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT);
+
+ if (offnum > maxoff)
+ {
+ offnum = maxoff;
+ offGmax = true;
}
- break;
-
- case BTGreaterEqualStrategyNumber:
- if ( offGmax )
+
+ ItemPointerSet(current, blkno, offnum);
+
+ /*
+ * Now find the right place to start the scan. Result is the value
+ * we're looking for minus the value we're looking at in the index.
+ */
+
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+
+ /* it's yet other place to add some code latter for is(not)null */
+
+ strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure);
+
+ switch (strat)
{
- if (result < 0)
- {
- Assert ( !P_RIGHTMOST(pop) && maxoff == P_HIKEY );
- if ( !_bt_step(scan, &buf, ForwardScanDirection) )
- {
- _bt_relbuf(scan->relation, buf, BT_READ);
- so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(&(scan->currentItemData));
- return ((RetrieveIndexResult) NULL);
+ case BTLessStrategyNumber:
+ if (result <= 0)
+ {
+ do
+ {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result <= 0);
+
+ /* if this is true, the key we just looked at is gone */
+ if (result > 0)
+ _bt_twostep(scan, &buf, ForwardScanDirection);
}
- }
- else if (result > 0)
- { /*
- * Just remember: _bt_binsrch() returns the OffsetNumber of
- * the first matching key on the page, or the OffsetNumber at
- * which the matching key WOULD APPEAR IF IT WERE on this page.
- * No key on this page, but offnum from _bt_binsrch() greater
- * maxoff - have to move right. - vadim 12/06/96
- */
- _bt_twostep(scan, &buf, ForwardScanDirection);
- }
+ break;
+
+ case BTLessEqualStrategyNumber:
+ if (result >= 0)
+ {
+ do
+ {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+
+ if (result < 0)
+ _bt_twostep(scan, &buf, BackwardScanDirection);
+ }
+ break;
+
+ case BTEqualStrategyNumber:
+ if (result != 0)
+ {
+ _bt_relbuf(scan->relation, buf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(&(scan->currentItemData));
+ return ((RetrieveIndexResult) NULL);
+ }
+ break;
+
+ case BTGreaterEqualStrategyNumber:
+ if (offGmax)
+ {
+ if (result < 0)
+ {
+ Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY);
+ if (!_bt_step(scan, &buf, ForwardScanDirection))
+ {
+ _bt_relbuf(scan->relation, buf, BT_READ);
+ so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(&(scan->currentItemData));
+ return ((RetrieveIndexResult) NULL);
+ }
+ }
+ else if (result > 0)
+ { /* Just remember: _bt_binsrch() returns
+ * the OffsetNumber of the first matching
+ * key on the page, or the OffsetNumber at
+ * which the matching key WOULD APPEAR IF
+ * IT WERE on this page. No key on this
+ * page, but offnum from _bt_binsrch()
+ * greater maxoff - have to move right. -
+ * vadim 12/06/96 */
+ _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ }
+ else if (result < 0)
+ {
+ do
+ {
+ if (!_bt_twostep(scan, &buf, BackwardScanDirection))
+ break;
+
+ page = BufferGetPage(buf);
+ offnum = ItemPointerGetOffsetNumber(current);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result < 0);
+
+ if (result > 0)
+ _bt_twostep(scan, &buf, ForwardScanDirection);
+ }
+ break;
+
+ case BTGreaterStrategyNumber:
+ /* offGmax helps as above */
+ if (result >= 0 || offGmax)
+ {
+ do
+ {
+ if (!_bt_twostep(scan, &buf, ForwardScanDirection))
+ break;
+
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
+ } while (result >= 0);
+ }
+ break;
}
- else if (result < 0)
+
+ /* okay, current item pointer for the scan is right */
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &btitem->bti_itup;
+
+ if (_bt_checkkeys(scan, itup, &keysok))
{
- do {
- if (!_bt_twostep(scan, &buf, BackwardScanDirection))
- break;
-
- page = BufferGetPage(buf);
- offnum = ItemPointerGetOffsetNumber(current);
- result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
- } while (result < 0);
-
- if (result > 0)
- _bt_twostep(scan, &buf, ForwardScanDirection);
+ res = FormRetrieveIndexResult(current, &(itup->t_tid));
+
+ /* remember which buffer we have pinned */
+ so->btso_curbuf = buf;
}
- break;
-
- case BTGreaterStrategyNumber:
- /* offGmax helps as above */
- if (result >= 0 || offGmax) {
- do {
- if (!_bt_twostep(scan, &buf, ForwardScanDirection))
- break;
-
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum);
- } while (result >= 0);
+ else if (keysok >= so->numberOfFirstKeys)
+ {
+ so->btso_curbuf = buf;
+ return (_bt_next(scan, dir));
}
- break;
- }
-
- /* okay, current item pointer for the scan is right */
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &btitem->bti_itup;
-
- if ( _bt_checkkeys (scan, itup, &keysok) )
- {
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
-
- /* remember which buffer we have pinned */
- so->btso_curbuf = buf;
- }
- else if ( keysok >= so->numberOfFirstKeys )
- {
- so->btso_curbuf = buf;
- return (_bt_next (scan, dir));
- }
- else
- {
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- res = (RetrieveIndexResult) NULL;
- }
-
- return (res);
+ else
+ {
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
+ }
+
+ return (res);
}
/*
- * _bt_step() -- Step one item in the requested direction in a scan on
- * the tree.
+ * _bt_step() -- Step one item in the requested direction in a scan on
+ * the tree.
*
- * If no adjacent record exists in the requested direction, return
- * false. Else, return true and set the currentItemData for the
- * scan to the right thing.
+ * If no adjacent record exists in the requested direction, return
+ * false. Else, return true and set the currentItemData for the
+ * scan to the right thing.
*/
bool
-_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+_bt_step(IndexScanDesc scan, Buffer * bufP, ScanDirection dir)
{
- Page page;
- BTPageOpaque opaque;
- OffsetNumber offnum, maxoff;
- OffsetNumber start;
- BlockNumber blkno;
- BlockNumber obknum;
- BTScanOpaque so;
- ItemPointer current;
- Relation rel;
-
- rel = scan->relation;
- current = &(scan->currentItemData);
- offnum = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- so = (BTScanOpaque) scan->opaque;
- maxoff = PageGetMaxOffsetNumber(page);
-
- /* get the next tuple */
- if (ScanDirectionIsForward(dir)) {
- if (!PageIsEmpty(page) && offnum < maxoff) {
- offnum = OffsetNumberNext(offnum);
- } else {
-
- /* if we're at end of scan, release the buffer and return */
- blkno = opaque->btpo_next;
- if (P_RIGHTMOST(opaque)) {
- _bt_relbuf(rel, *bufP, BT_READ);
- ItemPointerSetInvalid(current);
- *bufP = so->btso_curbuf = InvalidBuffer;
- return (false);
- } else {
-
- /* walk right to the next page with data */
- _bt_relbuf(rel, *bufP, BT_READ);
- for (;;) {
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- if (!PageIsEmpty(page) && start <= maxoff) {
- break;
- } else {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum,
+ maxoff;
+ OffsetNumber start;
+ BlockNumber blkno;
+ BlockNumber obknum;
+ BTScanOpaque so;
+ ItemPointer current;
+ Relation rel;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ so = (BTScanOpaque) scan->opaque;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /* get the next tuple */
+ if (ScanDirectionIsForward(dir))
+ {
+ if (!PageIsEmpty(page) && offnum < maxoff)
+ {
+ offnum = OffsetNumberNext(offnum);
+ }
+ else
+ {
+
+ /* if we're at end of scan, release the buffer and return */
blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
- if (blkno == P_NONE) {
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
- return (false);
+ if (P_RIGHTMOST(opaque))
+ {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ ItemPointerSetInvalid(current);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ return (false);
+ }
+ else
+ {
+
+ /* walk right to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;)
+ {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (!PageIsEmpty(page) && start <= maxoff)
+ {
+ break;
+ }
+ else
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE)
+ {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = start;
}
- }
}
- offnum = start;
- }
}
- } else if (ScanDirectionIsBackward(dir)) {
-
- /* remember that high key is item zero on non-rightmost pages */
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ else if (ScanDirectionIsBackward(dir))
+ {
- if (offnum > start) {
- offnum = OffsetNumberPrev(offnum);
- } else {
-
- /* if we're at end of scan, release the buffer and return */
- blkno = opaque->btpo_prev;
- if (P_LEFTMOST(opaque)) {
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
- return (false);
- } else {
-
- obknum = BufferGetBlockNumber(*bufP);
-
- /* walk right to the next page with data */
- _bt_relbuf(rel, *bufP, BT_READ);
- for (;;) {
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
-
- /*
- * If the adjacent page just split, then we may have the
- * wrong block. Handle this case. Because pages only
- * split right, we don't have to worry about this failing
- * to terminate.
- */
-
- while (opaque->btpo_next != obknum) {
- blkno = opaque->btpo_next;
- _bt_relbuf(rel, *bufP, BT_READ);
- *bufP = _bt_getbuf(rel, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- }
-
- /* don't consider the high key */
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- /* anything to look at here? */
- if (!PageIsEmpty(page) && maxoff >= start) {
- break;
- } else {
+ /* remember that high key is item zero on non-rightmost pages */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ if (offnum > start)
+ {
+ offnum = OffsetNumberPrev(offnum);
+ }
+ else
+ {
+
+ /* if we're at end of scan, release the buffer and return */
blkno = opaque->btpo_prev;
- obknum = BufferGetBlockNumber(*bufP);
- _bt_relbuf(rel, *bufP, BT_READ);
- if (blkno == P_NONE) {
- *bufP = so->btso_curbuf = InvalidBuffer;
- ItemPointerSetInvalid(current);
- return (false);
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ else
+ {
+
+ obknum = BufferGetBlockNumber(*bufP);
+
+ /* walk right to the next page with data */
+ _bt_relbuf(rel, *bufP, BT_READ);
+ for (;;)
+ {
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * If the adjacent page just split, then we may have
+ * the wrong block. Handle this case. Because pages
+ * only split right, we don't have to worry about this
+ * failing to terminate.
+ */
+
+ while (opaque->btpo_next != obknum)
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, *bufP, BT_READ);
+ *bufP = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
+
+ /* don't consider the high key */
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* anything to look at here? */
+ if (!PageIsEmpty(page) && maxoff >= start)
+ {
+ break;
+ }
+ else
+ {
+ blkno = opaque->btpo_prev;
+ obknum = BufferGetBlockNumber(*bufP);
+ _bt_relbuf(rel, *bufP, BT_READ);
+ if (blkno == P_NONE)
+ {
+ *bufP = so->btso_curbuf = InvalidBuffer;
+ ItemPointerSetInvalid(current);
+ return (false);
+ }
+ }
+ }
+ offnum = maxoff;/* XXX PageIsEmpty? */
}
- }
}
- offnum = maxoff; /* XXX PageIsEmpty? */
- }
}
- }
- blkno = BufferGetBlockNumber(*bufP);
- so->btso_curbuf = *bufP;
- ItemPointerSet(current, blkno, offnum);
-
- return (true);
+ blkno = BufferGetBlockNumber(*bufP);
+ so->btso_curbuf = *bufP;
+ ItemPointerSet(current, blkno, offnum);
+
+ return (true);
}
/*
- * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
- * if an adjacent record exists.
+ * _bt_twostep() -- Move to an adjacent record in a scan on the tree,
+ * if an adjacent record exists.
*
- * This is like _bt_step, except that if no adjacent record exists
- * it restores us to where we were before trying the step. This is
- * only hairy when you cross page boundaries, since the page you cross
- * from could have records inserted or deleted, or could even split.
- * This is unlikely, but we try to handle it correctly here anyway.
+ * This is like _bt_step, except that if no adjacent record exists
+ * it restores us to where we were before trying the step. This is
+ * only hairy when you cross page boundaries, since the page you cross
+ * from could have records inserted or deleted, or could even split.
+ * This is unlikely, but we try to handle it correctly here anyway.
*
- * This routine contains the only case in which our changes to Lehman
- * and Yao's algorithm.
+ * This routine contains the only case in which our changes to Lehman
+ * and Yao's algorithm.
*
- * Like step, this routine leaves the scan's currentItemData in the
- * proper state and acquires a lock and pin on *bufP. If the twostep
- * succeeded, we return true; otherwise, we return false.
+ * Like step, this routine leaves the scan's currentItemData in the
+ * proper state and acquires a lock and pin on *bufP. If the twostep
+ * succeeded, we return true; otherwise, we return false.
*/
-static bool
-_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+static bool
+_bt_twostep(IndexScanDesc scan, Buffer * bufP, ScanDirection dir)
{
- Page page;
- BTPageOpaque opaque;
- OffsetNumber offnum, maxoff;
- OffsetNumber start;
- ItemPointer current;
- ItemId itemid;
- int itemsz;
- BTItem btitem;
- BTItem svitem;
- BlockNumber blkno;
-
- blkno = BufferGetBlockNumber(*bufP);
- page = BufferGetPage(*bufP);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
- current = &(scan->currentItemData);
- offnum = ItemPointerGetOffsetNumber(current);
-
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
-
- /* if we're safe, just do it */
- if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */
- ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
- return (true);
- } else if (ScanDirectionIsBackward(dir) && offnum > start) {
- ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
- return (true);
- }
-
- /* if we've hit end of scan we don't have to do any work */
- if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) {
- return (false);
- } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) {
- return (false);
- }
-
- /*
- * Okay, it's off the page; let _bt_step() do the hard work, and we'll
- * try to remember where we were. This is not guaranteed to work; this
- * is the only place in the code where concurrency can screw us up,
- * and it's because we want to be able to move in two directions in
- * the scan.
- */
-
- itemid = PageGetItemId(page, offnum);
- itemsz = ItemIdGetLength(itemid);
- btitem = (BTItem) PageGetItem(page, itemid);
- svitem = (BTItem) palloc(itemsz);
- memmove((char *) svitem, (char *) btitem, itemsz);
-
- if (_bt_step(scan, bufP, dir)) {
- pfree(svitem);
- return (true);
- }
-
- /* try to find our place again */
- *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
- page = BufferGetPage(*bufP);
- maxoff = PageGetMaxOffsetNumber(page);
-
- while (offnum <= maxoff) {
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber offnum,
+ maxoff;
+ OffsetNumber start;
+ ItemPointer current;
+ ItemId itemid;
+ int itemsz;
+ BTItem btitem;
+ BTItem svitem;
+ BlockNumber blkno;
+
+ blkno = BufferGetBlockNumber(*bufP);
+ page = BufferGetPage(*bufP);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ current = &(scan->currentItemData);
+ offnum = ItemPointerGetOffsetNumber(current);
+
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /* if we're safe, just do it */
+ if (ScanDirectionIsForward(dir) && offnum < maxoff)
+ { /* XXX PageIsEmpty? */
+ ItemPointerSet(current, blkno, OffsetNumberNext(offnum));
+ return (true);
+ }
+ else if (ScanDirectionIsBackward(dir) && offnum > start)
+ {
+ ItemPointerSet(current, blkno, OffsetNumberPrev(offnum));
+ return (true);
+ }
+
+ /* if we've hit end of scan we don't have to do any work */
+ if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque))
+ {
+ return (false);
+ }
+ else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque))
+ {
+ return (false);
+ }
+
+ /*
+ * Okay, it's off the page; let _bt_step() do the hard work, and we'll
+ * try to remember where we were. This is not guaranteed to work;
+ * this is the only place in the code where concurrency can screw us
+ * up, and it's because we want to be able to move in two directions
+ * in the scan.
+ */
+
itemid = PageGetItemId(page, offnum);
+ itemsz = ItemIdGetLength(itemid);
btitem = (BTItem) PageGetItem(page, itemid);
- if ( BTItemSame (btitem, svitem) ) {
- pfree(svitem);
- ItemPointerSet(current, blkno, offnum);
- return (false);
+ svitem = (BTItem) palloc(itemsz);
+ memmove((char *) svitem, (char *) btitem, itemsz);
+
+ if (_bt_step(scan, bufP, dir))
+ {
+ pfree(svitem);
+ return (true);
+ }
+
+ /* try to find our place again */
+ *bufP = _bt_getbuf(scan->relation, blkno, BT_READ);
+ page = BufferGetPage(*bufP);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ while (offnum <= maxoff)
+ {
+ itemid = PageGetItemId(page, offnum);
+ btitem = (BTItem) PageGetItem(page, itemid);
+ if (BTItemSame(btitem, svitem))
+ {
+ pfree(svitem);
+ ItemPointerSet(current, blkno, offnum);
+ return (false);
+ }
}
- }
-
- /*
- * XXX crash and burn -- can't find our place. We can be a little
- * smarter -- walk to the next page to the right, for example, since
- * that's the only direction that splits happen in. Deletions screw
- * us up less often since they're only done by the vacuum daemon.
- */
-
- elog(WARN, "btree synchronization error: concurrent update botched scan");
-
- return (false);
+
+ /*
+ * XXX crash and burn -- can't find our place. We can be a little
+ * smarter -- walk to the next page to the right, for example, since
+ * that's the only direction that splits happen in. Deletions screw
+ * us up less often since they're only done by the vacuum daemon.
+ */
+
+ elog(WARN, "btree synchronization error: concurrent update botched scan");
+
+ return (false);
}
/*
- * _bt_endpoint() -- Find the first or last key in the index.
+ * _bt_endpoint() -- Find the first or last key in the index.
*/
-static RetrieveIndexResult
+static RetrieveIndexResult
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
- Relation rel;
- Buffer buf;
- Page page;
- BTPageOpaque opaque;
- ItemPointer current;
- OffsetNumber offnum, maxoff;
- OffsetNumber start = 0;
- BlockNumber blkno;
- BTItem btitem;
- IndexTuple itup;
- BTScanOpaque so;
- RetrieveIndexResult res;
- Size keysok;
-
- rel = scan->relation;
- current = &(scan->currentItemData);
- so = (BTScanOpaque) scan->opaque;
-
- buf = _bt_getroot(rel, BT_READ);
- blkno = BufferGetBlockNumber(buf);
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- for (;;) {
- if (opaque->btpo_flags & BTP_LEAF)
- break;
-
- if (ScanDirectionIsForward(dir)) {
- offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- } else {
- offnum = PageGetMaxOffsetNumber(page);
- }
-
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
- itup = &(btitem->bti_itup);
-
- blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
-
- _bt_relbuf(rel, buf, BT_READ);
- buf = _bt_getbuf(rel, blkno, BT_READ);
+ Relation rel;
+ Buffer buf;
+ Page page;
+ BTPageOpaque opaque;
+ ItemPointer current;
+ OffsetNumber offnum,
+ maxoff;
+ OffsetNumber start = 0;
+ BlockNumber blkno;
+ BTItem btitem;
+ IndexTuple itup;
+ BTScanOpaque so;
+ RetrieveIndexResult res;
+ Size keysok;
+
+ rel = scan->relation;
+ current = &(scan->currentItemData);
+ so = (BTScanOpaque) scan->opaque;
+
+ buf = _bt_getroot(rel, BT_READ);
+ blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- /*
- * Race condition: If the child page we just stepped onto is
- * in the process of being split, we need to make sure we're
- * all the way at the right edge of the tree. See the paper
- * by Lehman and Yao.
- */
-
- if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) {
- do {
- blkno = opaque->btpo_next;
+
+ for (;;)
+ {
+ if (opaque->btpo_flags & BTP_LEAF)
+ break;
+
+ if (ScanDirectionIsForward(dir))
+ {
+ offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+ }
+ else
+ {
+ offnum = PageGetMaxOffsetNumber(page);
+ }
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
+ itup = &(btitem->bti_itup);
+
+ blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+
_bt_relbuf(rel, buf, BT_READ);
buf = _bt_getbuf(rel, blkno, BT_READ);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- } while (! P_RIGHTMOST(opaque));
+
+ /*
+ * Race condition: If the child page we just stepped onto is in
+ * the process of being split, we need to make sure we're all the
+ * way at the right edge of the tree. See the paper by Lehman and
+ * Yao.
+ */
+
+ if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque))
+ {
+ do
+ {
+ blkno = opaque->btpo_next;
+ _bt_relbuf(rel, buf, BT_READ);
+ buf = _bt_getbuf(rel, blkno, BT_READ);
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ } while (!P_RIGHTMOST(opaque));
+ }
}
- }
-
- /* okay, we've got the {left,right}-most page in the tree */
- maxoff = PageGetMaxOffsetNumber(page);
-
- if (ScanDirectionIsForward(dir)) {
- if ( !P_LEFTMOST(opaque) ) /* non-leftmost page ? */
- elog (WARN, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno);
- start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
- /*
- * I don't understand this stuff! It doesn't work for non-rightmost
- * pages with only one element (P_HIKEY) which we have after
- * deletion itups by vacuum (it's case of start > maxoff).
- * Scanning in BackwardScanDirection is not understandable at all.
- * Well - new stuff. - vadim 12/06/96
- */
+
+ /* okay, we've got the {left,right}-most page in the tree */
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ if (ScanDirectionIsForward(dir))
+ {
+ if (!P_LEFTMOST(opaque))/* non-leftmost page ? */
+ elog(WARN, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno);
+ start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
+
+ /*
+ * I don't understand this stuff! It doesn't work for
+ * non-rightmost pages with only one element (P_HIKEY) which we
+ * have after deletion itups by vacuum (it's case of start >
+ * maxoff). Scanning in BackwardScanDirection is not
+ * understandable at all. Well - new stuff. - vadim 12/06/96
+ */
#if 0
- if (PageIsEmpty(page) || start > maxoff) {
- ItemPointerSet(current, blkno, maxoff);
- if (!_bt_step(scan, &buf, BackwardScanDirection))
- return ((RetrieveIndexResult) NULL);
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
+ if (PageIsEmpty(page) || start > maxoff)
+ {
+ ItemPointerSet(current, blkno, maxoff);
+ if (!_bt_step(scan, &buf, BackwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ }
#endif
- if ( PageIsEmpty (page) )
+ if (PageIsEmpty(page))
+ {
+ if (start != P_HIKEY) /* non-rightmost page */
+ elog(WARN, "_bt_endpoint: non-rightmost page (%u) is empty", blkno);
+
+ /*
+ * It's left- & right- most page - root page, - and it's
+ * empty...
+ */
+ _bt_relbuf(rel, buf, BT_READ);
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ return ((RetrieveIndexResult) NULL);
+ }
+ if (start > maxoff) /* start == 2 && maxoff == 1 */
+ {
+ ItemPointerSet(current, blkno, maxoff);
+ if (!_bt_step(scan, &buf, ForwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ }
+ /* new stuff ends here */
+ else
+ {
+ ItemPointerSet(current, blkno, start);
+ }
+ }
+ else if (ScanDirectionIsBackward(dir))
{
- if ( start != P_HIKEY ) /* non-rightmost page */
- elog (WARN, "_bt_endpoint: non-rightmost page (%u) is empty", blkno);
- /* It's left- & right- most page - root page, - and it's empty... */
- _bt_relbuf(rel, buf, BT_READ);
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- return ((RetrieveIndexResult) NULL);
+
+ /*
+ * I don't understand this stuff too! If RIGHT-most leaf page is
+ * empty why do scanning in ForwardScanDirection ??? Well - new
+ * stuff. - vadim 12/06/96
+ */
+#if 0
+ if (PageIsEmpty(page))
+ {
+ ItemPointerSet(current, blkno, FirstOffsetNumber);
+ if (!_bt_step(scan, &buf, ForwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ }
+#endif
+ if (PageIsEmpty(page))
+ {
+ /* If it's leftmost page too - it's empty root page... */
+ if (P_LEFTMOST(opaque))
+ {
+ _bt_relbuf(rel, buf, BT_READ);
+ ItemPointerSetInvalid(current);
+ so->btso_curbuf = InvalidBuffer;
+ return ((RetrieveIndexResult) NULL);
+ }
+ /* Go back ! */
+ ItemPointerSet(current, blkno, FirstOffsetNumber);
+ if (!_bt_step(scan, &buf, BackwardScanDirection))
+ return ((RetrieveIndexResult) NULL);
+
+ start = ItemPointerGetOffsetNumber(current);
+ page = BufferGetPage(buf);
+ }
+ /* new stuff ends here */
+ else
+ {
+ start = PageGetMaxOffsetNumber(page);
+ ItemPointerSet(current, blkno, start);
+ }
}
- if ( start > maxoff ) /* start == 2 && maxoff == 1 */
+ else
{
- ItemPointerSet(current, blkno, maxoff);
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- return ((RetrieveIndexResult) NULL);
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
+ elog(WARN, "Illegal scan direction %d", dir);
}
- /* new stuff ends here */
- else {
- ItemPointerSet(current, blkno, start);
+
+ btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
+ itup = &(btitem->bti_itup);
+
+ /* see if we picked a winner */
+ if (_bt_checkkeys(scan, itup, &keysok))
+ {
+ res = FormRetrieveIndexResult(current, &(itup->t_tid));
+
+ /* remember which buffer we have pinned */
+ so->btso_curbuf = buf;
}
- } else if (ScanDirectionIsBackward(dir)) {
- /*
- * I don't understand this stuff too! If RIGHT-most leaf page is
- * empty why do scanning in ForwardScanDirection ???
- * Well - new stuff. - vadim 12/06/96
- */
-#if 0
- if (PageIsEmpty(page)) {
- ItemPointerSet(current, blkno, FirstOffsetNumber);
- if (!_bt_step(scan, &buf, ForwardScanDirection))
- return ((RetrieveIndexResult) NULL);
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
+ else if (keysok >= so->numberOfFirstKeys)
+ {
+ so->btso_curbuf = buf;
+ return (_bt_next(scan, dir));
}
-#endif
- if (PageIsEmpty(page))
+ else
{
- /* If it's leftmost page too - it's empty root page... */
- if ( P_LEFTMOST(opaque) )
- {
- _bt_relbuf(rel, buf, BT_READ);
ItemPointerSetInvalid(current);
so->btso_curbuf = InvalidBuffer;
- return ((RetrieveIndexResult) NULL);
- }
- /* Go back ! */
- ItemPointerSet(current, blkno, FirstOffsetNumber);
- if (!_bt_step(scan, &buf, BackwardScanDirection))
- return ((RetrieveIndexResult) NULL);
-
- start = ItemPointerGetOffsetNumber(current);
- page = BufferGetPage(buf);
- }
- /* new stuff ends here */
- else {
- start = PageGetMaxOffsetNumber(page);
- ItemPointerSet(current, blkno, start);
+ _bt_relbuf(rel, buf, BT_READ);
+ res = (RetrieveIndexResult) NULL;
}
- } else {
- elog(WARN, "Illegal scan direction %d", dir);
- }
-
- btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
- itup = &(btitem->bti_itup);
-
- /* see if we picked a winner */
- if ( _bt_checkkeys (scan, itup, &keysok) )
- {
- res = FormRetrieveIndexResult(current, &(itup->t_tid));
-
- /* remember which buffer we have pinned */
- so->btso_curbuf = buf;
- }
- else if ( keysok >= so->numberOfFirstKeys )
- {
- so->btso_curbuf = buf;
- return (_bt_next (scan, dir));
- }
- else
- {
- ItemPointerSetInvalid(current);
- so->btso_curbuf = InvalidBuffer;
- _bt_relbuf(rel, buf, BT_READ);
- res = (RetrieveIndexResult) NULL;
- }
-
- return (res);
+
+ return (res);
}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 8e054d24abf..09cb43769f2 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -5,30 +5,30 @@
*
*
* IDENTIFICATION
- * $Id: nbtsort.c,v 1.19 1997/08/19 21:29:46 momjian Exp $
+ * $Id: nbtsort.c,v 1.20 1997/09/07 04:39:02 momjian Exp $
*
* NOTES
*
* what we do is:
* - generate a set of initial one-block runs, distributed round-robin
- * between the output tapes.
+ * between the output tapes.
* - for each pass,
- * - swap input and output tape sets, rewinding both and truncating
- * the output tapes.
- * - merge the current run in each input tape to the current output
- * tape.
- * - when each input run has been exhausted, switch to another output
- * tape and start processing another run.
+ * - swap input and output tape sets, rewinding both and truncating
+ * the output tapes.
+ * - merge the current run in each input tape to the current output
+ * tape.
+ * - when each input run has been exhausted, switch to another output
+ * tape and start processing another run.
* - when we have fewer runs than tapes, we know we are ready to start
- * merging into the btree leaf pages. (i.e., we do not have to wait
- * until we have exactly one tape.)
+ * merging into the btree leaf pages. (i.e., we do not have to wait
+ * until we have exactly one tape.)
* - as we extract tuples from the final runs, we build the pages for
- * each level. when we have only one page on a level, it must be the
- * root -- it can be attached to the btree metapage and we are done.
+ * each level. when we have only one page on a level, it must be the
+ * root -- it can be attached to the btree metapage and we are done.
*
* conventions:
* - external interface routines take in and return "void *" for their
- * opaque handles. this is for modularity reasons.
+ * opaque handles. this is for modularity reasons.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
@@ -58,20 +58,21 @@
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
#ifdef BTREE_BUILD_STATS
#include <tcop/tcopprot.h>
-extern int ShowExecutorStats;
+extern int ShowExecutorStats;
+
#endif
-static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
-static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
-static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
-static void _bt_uppershutdown(Relation index, BTPageState *state);
+static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags);
+static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
+static void *_bt_pagestate(Relation index, int flags, int level, bool doupper);
+static void _bt_uppershutdown(Relation index, BTPageState * state);
/*
* turn on debugging output.
@@ -83,18 +84,18 @@ static void _bt_uppershutdown(Relation index, BTPageState *state);
#define FASTBUILD_SPOOL
#define FASTBUILD_MERGE
-#define MAXTAPES (7)
-#define TAPEBLCKSZ (MAXBLCKSZ << 2)
-#define TAPETEMP "pg_btsortXXXXXX"
+#define MAXTAPES (7)
+#define TAPEBLCKSZ (MAXBLCKSZ << 2)
+#define TAPETEMP "pg_btsortXXXXXX"
-extern int NDirectFileRead;
-extern int NDirectFileWrite;
-extern char *mktemp(char *template);
+extern int NDirectFileRead;
+extern int NDirectFileWrite;
+extern char *mktemp(char *template);
/*
- * this is what we use to shovel BTItems in and out of memory. it's
+ * this is what we use to shovel BTItems in and out of memory. it's
* bigger than a standard block because we are doing a lot of strictly
- * sequential i/o. this is obviously something of a tradeoff since we
+ * sequential i/o. this is obviously something of a tradeoff since we
* are potentially reading a bunch of zeroes off of disk in many
* cases.
*
@@ -104,14 +105,15 @@ extern char *mktemp(char *template);
* the only thing like that so i'm not going to worry about wasting a
* few bytes.
*/
-typedef struct {
- int bttb_magic; /* magic number */
- int bttb_fd; /* file descriptor */
- int bttb_top; /* top of free space within bttb_data */
- short bttb_ntup; /* number of tuples in this block */
- short bttb_eor; /* End-Of-Run marker */
- char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
-} BTTapeBlock;
+typedef struct
+{
+ int bttb_magic; /* magic number */
+ int bttb_fd; /* file descriptor */
+ int bttb_top; /* top of free space within bttb_data */
+ short bttb_ntup; /* number of tuples in this block */
+ short bttb_eor; /* End-Of-Run marker */
+ char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)];
+} BTTapeBlock;
/*
* this structure holds the bookkeeping for a simple balanced multiway
@@ -120,13 +122,14 @@ typedef struct {
* right now. though if psort was in a condition that i could hack it
* to do this, you bet i would.)
*/
-typedef struct {
- int bts_ntapes;
- int bts_tape;
- BTTapeBlock **bts_itape; /* input tape blocks */
- BTTapeBlock **bts_otape; /* output tape blocks */
- bool isunique;
-} BTSpool;
+typedef struct
+{
+ int bts_ntapes;
+ int bts_tape;
+ BTTapeBlock **bts_itape; /* input tape blocks */
+ BTTapeBlock **bts_otape; /* output tape blocks */
+ bool isunique;
+} BTSpool;
/*-------------------------------------------------------------------------
* sorting comparison routine - returns {-1,0,1} depending on whether
@@ -146,101 +149,102 @@ typedef struct {
* what the heck.
* *-------------------------------------------------------------------------
*/
-typedef struct {
- Datum *btsk_datum;
- char *btsk_nulls;
- BTItem btsk_item;
-} BTSortKey;
+typedef struct
+{
+ Datum *btsk_datum;
+ char *btsk_nulls;
+ BTItem btsk_item;
+} BTSortKey;
static Relation _bt_sortrel;
-static int _bt_nattr;
-static BTSpool * _bt_inspool;
+static int _bt_nattr;
+static BTSpool *_bt_inspool;
static void
-_bt_isortcmpinit(Relation index, BTSpool *spool)
+_bt_isortcmpinit(Relation index, BTSpool * spool)
{
- _bt_sortrel = index;
- _bt_inspool = spool;
- _bt_nattr = index->rd_att->natts;
+ _bt_sortrel = index;
+ _bt_inspool = spool;
+ _bt_nattr = index->rd_att->natts;
}
static int
-_bt_isortcmp(BTSortKey *k1, BTSortKey *k2)
+_bt_isortcmp(BTSortKey * k1, BTSortKey * k2)
{
- Datum *k1_datum = k1->btsk_datum;
- Datum *k2_datum = k2->btsk_datum;
- char *k1_nulls = k1->btsk_nulls;
- char *k2_nulls = k2->btsk_nulls;
- bool equal_isnull = false;
- int i;
-
- if (k1->btsk_item == (BTItem) NULL)
- {
- if (k2->btsk_item == (BTItem) NULL)
- return(0); /* 1 = 2 */
- return(1); /* 1 > 2 */
- }
- else if (k2->btsk_item == (BTItem) NULL)
- return(-1); /* 1 < 2 */
-
- for (i = 0; i < _bt_nattr; i++)
- {
- if ( k1_nulls[i] != ' ' ) /* k1 attr is NULL */
+ Datum *k1_datum = k1->btsk_datum;
+ Datum *k2_datum = k2->btsk_datum;
+ char *k1_nulls = k1->btsk_nulls;
+ char *k2_nulls = k2->btsk_nulls;
+ bool equal_isnull = false;
+ int i;
+
+ if (k1->btsk_item == (BTItem) NULL)
{
- if ( k2_nulls[i] != ' ' ) /* the same for k2 */
- {
- equal_isnull = true;
- continue;
- }
- return (1); /* NULL ">" NOT_NULL */
+ if (k2->btsk_item == (BTItem) NULL)
+ return (0); /* 1 = 2 */
+ return (1); /* 1 > 2 */
}
- else if ( k2_nulls[i] != ' ' ) /* k2 attr is NULL */
- return (-1); /* NOT_NULL "<" NULL */
-
- if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber,
- k1_datum[i], k2_datum[i]))
- return(1); /* 1 > 2 */
- else if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber,
- k2_datum[i], k1_datum[i]))
- return(-1); /* 1 < 2 */
- }
-
- if ( _bt_inspool->isunique && !equal_isnull )
- {
- _bt_spooldestroy ((void*)_bt_inspool);
- elog (WARN, "Cannot create unique index. Table contains non-unique values");
- }
- return(0); /* 1 = 2 */
+ else if (k2->btsk_item == (BTItem) NULL)
+ return (-1); /* 1 < 2 */
+
+ for (i = 0; i < _bt_nattr; i++)
+ {
+ if (k1_nulls[i] != ' ') /* k1 attr is NULL */
+ {
+ if (k2_nulls[i] != ' ') /* the same for k2 */
+ {
+ equal_isnull = true;
+ continue;
+ }
+ return (1); /* NULL ">" NOT_NULL */
+ }
+ else if (k2_nulls[i] != ' ') /* k2 attr is NULL */
+ return (-1); /* NOT_NULL "<" NULL */
+
+ if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
+ k1_datum[i], k2_datum[i]))
+ return (1); /* 1 > 2 */
+ else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber,
+ k2_datum[i], k1_datum[i]))
+ return (-1); /* 1 < 2 */
+ }
+
+ if (_bt_inspool->isunique && !equal_isnull)
+ {
+ _bt_spooldestroy((void *) _bt_inspool);
+ elog(WARN, "Cannot create unique index. Table contains non-unique values");
+ }
+ return (0); /* 1 = 2 */
}
static void
-_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
+_bt_setsortkey(Relation index, BTItem bti, BTSortKey * sk)
{
- sk->btsk_item = (BTItem) NULL;
- sk->btsk_datum = (Datum*) NULL;
- sk->btsk_nulls = (char*) NULL;
-
- if (bti != (BTItem) NULL)
- {
- IndexTuple it = &(bti->bti_itup);
- TupleDesc itdesc = index->rd_att;
- Datum *dp = (Datum*) palloc (_bt_nattr * sizeof (Datum));
- char *np = (char*) palloc (_bt_nattr * sizeof (char));
- bool isnull;
- int i;
-
- for (i = 0; i < _bt_nattr; i++)
- {
- dp[i] = index_getattr(it, i+1, itdesc, &isnull);
- if ( isnull )
- np[i] = 'n';
- else
- np[i] = ' ';
+ sk->btsk_item = (BTItem) NULL;
+ sk->btsk_datum = (Datum *) NULL;
+ sk->btsk_nulls = (char *) NULL;
+
+ if (bti != (BTItem) NULL)
+ {
+ IndexTuple it = &(bti->bti_itup);
+ TupleDesc itdesc = index->rd_att;
+ Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum));
+ char *np = (char *) palloc(_bt_nattr * sizeof(char));
+ bool isnull;
+ int i;
+
+ for (i = 0; i < _bt_nattr; i++)
+ {
+ dp[i] = index_getattr(it, i + 1, itdesc, &isnull);
+ if (isnull)
+ np[i] = 'n';
+ else
+ np[i] = ' ';
+ }
+ sk->btsk_item = bti;
+ sk->btsk_datum = dp;
+ sk->btsk_nulls = np;
}
- sk->btsk_item = bti;
- sk->btsk_datum = dp;
- sk->btsk_nulls = np;
- }
}
/*-------------------------------------------------------------------------
@@ -254,84 +258,100 @@ _bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk)
* XXX these probably ought to be generic library functions.
*-------------------------------------------------------------------------
*/
-typedef struct {
- int btpqe_tape; /* tape identifier */
- BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */
-} BTPriQueueElem;
-
-#define MAXELEM MAXTAPES
-typedef struct {
- int btpq_nelem;
- BTPriQueueElem btpq_queue[MAXELEM];
- Relation btpq_rel;
-} BTPriQueue;
+typedef struct
+{
+ int btpqe_tape; /* tape identifier */
+ BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */
+} BTPriQueueElem;
+
+#define MAXELEM MAXTAPES
+typedef struct
+{
+ int btpq_nelem;
+ BTPriQueueElem btpq_queue[MAXELEM];
+ Relation btpq_rel;
+} BTPriQueue;
/* be sure to call _bt_isortcmpinit first */
#define GREATER(a, b) \
- (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
+ (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0)
static void
-_bt_pqsift(BTPriQueue *q, int parent)
+_bt_pqsift(BTPriQueue * q, int parent)
{
- int child;
- BTPriQueueElem e;
-
- for (child = parent * 2 + 1;
- child < q->btpq_nelem;
- child = parent * 2 + 1) {
- if (child < q->btpq_nelem - 1) {
- if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) {
- ++child;
- }
- }
- if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) {
- e = q->btpq_queue[child]; /* struct = */
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- q->btpq_queue[parent] = e; /* struct = */
- parent = child;
- } else {
- parent = child + 1;
+ int child;
+ BTPriQueueElem e;
+
+ for (child = parent * 2 + 1;
+ child < q->btpq_nelem;
+ child = parent * 2 + 1)
+ {
+ if (child < q->btpq_nelem - 1)
+ {
+ if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1])))
+ {
+ ++child;
+ }
+ }
+ if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child])))
+ {
+ e = q->btpq_queue[child]; /* struct = */
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ q->btpq_queue[parent] = e; /* struct = */
+ parent = child;
+ }
+ else
+ {
+ parent = child + 1;
+ }
}
- }
}
static int
-_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e)
+_bt_pqnext(BTPriQueue * q, BTPriQueueElem * e)
{
- if (q->btpq_nelem < 1) { /* already empty */
- return(-1);
- }
- *e = q->btpq_queue[0]; /* struct = */
-
- if (--q->btpq_nelem < 1) { /* now empty, don't sift */
- return(0);
- }
- q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
- _bt_pqsift(q, 0);
- return(0);
+ if (q->btpq_nelem < 1)
+ { /* already empty */
+ return (-1);
+ }
+ *e = q->btpq_queue[0]; /* struct = */
+
+ if (--q->btpq_nelem < 1)
+ { /* now empty, don't sift */
+ return (0);
+ }
+ q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */
+ _bt_pqsift(q, 0);
+ return (0);
}
static void
-_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
+_bt_pqadd(BTPriQueue * q, BTPriQueueElem * e)
{
- int child, parent;
-
- if (q->btpq_nelem >= MAXELEM) {
- elog(WARN, "_bt_pqadd: queue overflow");
- }
-
- child = q->btpq_nelem++;
- while (child > 0) {
- parent = child / 2;
- if (GREATER(e, &(q->btpq_queue[parent]))) {
- break;
- } else {
- q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
- child = parent;
+ int child,
+ parent;
+
+ if (q->btpq_nelem >= MAXELEM)
+ {
+ elog(WARN, "_bt_pqadd: queue overflow");
+ }
+
+ child = q->btpq_nelem++;
+ while (child > 0)
+ {
+ parent = child / 2;
+ if (GREATER(e, &(q->btpq_queue[parent])))
+ {
+ break;
+ }
+ else
+ {
+ q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */
+ child = parent;
+ }
}
- }
- q->btpq_queue[child] = *e; /* struct = */
+ q->btpq_queue[child] = *e; /* struct = */
}
/*-------------------------------------------------------------------------
@@ -339,37 +359,37 @@ _bt_pqadd(BTPriQueue *q, BTPriQueueElem *e)
*-------------------------------------------------------------------------
*/
-#define BTITEMSZ(btitem) \
- ((btitem) ? \
- (IndexTupleDSize((btitem)->bti_itup) + \
- (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
- 0)
-#define SPCLEFT(tape) \
- (sizeof((tape)->bttb_data) - (tape)->bttb_top)
-#define EMPTYTAPE(tape) \
- ((tape)->bttb_ntup <= 0)
-#define BTTAPEMAGIC 0x19660226
+#define BTITEMSZ(btitem) \
+ ((btitem) ? \
+ (IndexTupleDSize((btitem)->bti_itup) + \
+ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \
+ 0)
+#define SPCLEFT(tape) \
+ (sizeof((tape)->bttb_data) - (tape)->bttb_top)
+#define EMPTYTAPE(tape) \
+ ((tape)->bttb_ntup <= 0)
+#define BTTAPEMAGIC 0x19660226
/*
* reset the tape header for its next use without doing anything to
- * the physical tape file. (setting bttb_top to 0 makes the block
+ * the physical tape file. (setting bttb_top to 0 makes the block
* empty.)
*/
static void
-_bt_tapereset(BTTapeBlock *tape)
+_bt_tapereset(BTTapeBlock * tape)
{
- tape->bttb_eor = 0;
- tape->bttb_top = 0;
- tape->bttb_ntup = 0;
+ tape->bttb_eor = 0;
+ tape->bttb_top = 0;
+ tape->bttb_ntup = 0;
}
/*
* rewind the physical tape file.
*/
static void
-_bt_taperewind(BTTapeBlock *tape)
+_bt_taperewind(BTTapeBlock * tape)
{
- FileSeek(tape->bttb_fd, 0, SEEK_SET);
+ FileSeek(tape->bttb_fd, 0, SEEK_SET);
}
/*
@@ -382,17 +402,17 @@ _bt_taperewind(BTTapeBlock *tape)
* least you don't have to delete and reinsert the directory entries.
*/
static void
-_bt_tapeclear(BTTapeBlock *tape)
+_bt_tapeclear(BTTapeBlock * tape)
{
- /* blow away the contents of the old file */
- _bt_taperewind(tape);
+ /* blow away the contents of the old file */
+ _bt_taperewind(tape);
#if 0
- FileSync(tape->bttb_fd);
+ FileSync(tape->bttb_fd);
#endif
- FileTruncate(tape->bttb_fd, 0);
+ FileTruncate(tape->bttb_fd, 0);
- /* reset the buffer */
- _bt_tapereset(tape);
+ /* reset the buffer */
+ _bt_tapereset(tape);
}
/*
@@ -402,43 +422,44 @@ _bt_tapeclear(BTTapeBlock *tape)
static BTTapeBlock *
_bt_tapecreate(char *fname)
{
- BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
+ BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock));
- if (tape == (BTTapeBlock *) NULL) {
- elog(WARN, "_bt_tapecreate: out of memory");
- }
+ if (tape == (BTTapeBlock *) NULL)
+ {
+ elog(WARN, "_bt_tapecreate: out of memory");
+ }
- tape->bttb_magic = BTTAPEMAGIC;
+ tape->bttb_magic = BTTAPEMAGIC;
- tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
- Assert(tape->bttb_fd >= 0);
+ tape->bttb_fd = FileNameOpenFile(fname, O_RDWR | O_CREAT | O_TRUNC, 0600);
+ Assert(tape->bttb_fd >= 0);
- /* initialize the buffer */
- _bt_tapereset(tape);
+ /* initialize the buffer */
+ _bt_tapereset(tape);
- return(tape);
+ return (tape);
}
/*
* destroy the BTTapeBlock structure and its physical tape file.
*/
static void
-_bt_tapedestroy(BTTapeBlock *tape)
+_bt_tapedestroy(BTTapeBlock * tape)
{
- FileUnlink(tape->bttb_fd);
- pfree((void *) tape);
+ FileUnlink(tape->bttb_fd);
+ pfree((void *) tape);
}
/*
* flush the tape block to the file, marking End-Of-Run if requested.
*/
static void
-_bt_tapewrite(BTTapeBlock *tape, int eor)
+_bt_tapewrite(BTTapeBlock * tape, int eor)
{
- tape->bttb_eor = eor;
- FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
- NDirectFileWrite += TAPEBLCKSZ/MAXBLCKSZ;
- _bt_tapereset(tape);
+ tape->bttb_eor = eor;
+ FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ);
+ NDirectFileWrite += TAPEBLCKSZ / MAXBLCKSZ;
+ _bt_tapereset(tape);
}
/*
@@ -447,34 +468,36 @@ _bt_tapewrite(BTTapeBlock *tape, int eor)
*
* returns:
* - 0 if there are no more blocks in the tape or in this run (call
- * _bt_tapereset to clear the End-Of-Run marker)
+ * _bt_tapereset to clear the End-Of-Run marker)
* - 1 if a valid block was read
*/
static int
-_bt_taperead(BTTapeBlock *tape)
+_bt_taperead(BTTapeBlock * tape)
{
- int fd;
- int nread;
-
- if (tape->bttb_eor) {
- return(0); /* we are already at End-Of-Run */
- }
-
- /*
- * we're clobbering the old tape block, but we do need to save the
- * VFD (the one in the block we're reading is bogus).
- */
- fd = tape->bttb_fd;
- nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
- tape->bttb_fd = fd;
-
- if (nread != TAPEBLCKSZ) {
- Assert(nread == 0); /* we are at EOF */
- return(0);
- }
- Assert(tape->bttb_magic == BTTAPEMAGIC);
- NDirectFileRead += TAPEBLCKSZ/MAXBLCKSZ;
- return(1);
+ int fd;
+ int nread;
+
+ if (tape->bttb_eor)
+ {
+ return (0); /* we are already at End-Of-Run */
+ }
+
+ /*
+ * we're clobbering the old tape block, but we do need to save the VFD
+ * (the one in the block we're reading is bogus).
+ */
+ fd = tape->bttb_fd;
+ nread = FileRead(fd, (char *) tape, TAPEBLCKSZ);
+ tape->bttb_fd = fd;
+
+ if (nread != TAPEBLCKSZ)
+ {
+ Assert(nread == 0); /* we are at EOF */
+ return (0);
+ }
+ Assert(tape->bttb_magic == BTTAPEMAGIC);
+ NDirectFileRead += TAPEBLCKSZ / MAXBLCKSZ;
+ return (1);
}
/*
@@ -487,19 +510,20 @@ _bt_taperead(BTTapeBlock *tape)
* side effects:
* - sets 'pos' to the current position within the block.
*/
-static BTItem
-_bt_tapenext(BTTapeBlock *tape, char **pos)
+static BTItem
+_bt_tapenext(BTTapeBlock * tape, char **pos)
{
- Size itemsz;
- BTItem bti;
-
- if (*pos >= tape->bttb_data + tape->bttb_top) {
- return((BTItem) NULL);
- }
- bti = (BTItem) *pos;
- itemsz = BTITEMSZ(bti);
- *pos += DOUBLEALIGN(itemsz);
- return(bti);
+ Size itemsz;
+ BTItem bti;
+
+ if (*pos >= tape->bttb_data + tape->bttb_top)
+ {
+ return ((BTItem) NULL);
+ }
+ bti = (BTItem) * pos;
+ itemsz = BTITEMSZ(bti);
+ *pos += DOUBLEALIGN(itemsz);
+ return (bti);
}
/*
@@ -514,11 +538,11 @@ _bt_tapenext(BTTapeBlock *tape, char **pos)
* the beginning of free space.
*/
static void
-_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
+_bt_tapeadd(BTTapeBlock * tape, BTItem item, int itemsz)
{
- memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
- ++tape->bttb_ntup;
- tape->bttb_top += DOUBLEALIGN(itemsz);
+ memcpy(tape->bttb_data + tape->bttb_top, item, itemsz);
+ ++tape->bttb_ntup;
+ tape->bttb_top += DOUBLEALIGN(itemsz);
}
/*-------------------------------------------------------------------------
@@ -530,41 +554,44 @@ _bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz)
* create and initialize a spool structure, including the underlying
* files.
*/
-void *
+void *
_bt_spoolinit(Relation index, int ntapes, bool isunique)
{
- BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
- int i;
- char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
-
- if (btspool == (BTSpool *) NULL || fname == (char *) NULL) {
- elog(WARN, "_bt_spoolinit: out of memory");
- }
- memset((char *) btspool, 0, sizeof(BTSpool));
- btspool->bts_ntapes = ntapes;
- btspool->bts_tape = 0;
- btspool->isunique = isunique;
-
- btspool->bts_itape =
- (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- btspool->bts_otape =
- (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
- if (btspool->bts_itape == (BTTapeBlock **) NULL ||
- btspool->bts_otape == (BTTapeBlock **) NULL) {
- elog(WARN, "_bt_spoolinit: out of memory");
- }
-
- for (i = 0; i < ntapes; ++i) {
- btspool->bts_itape[i] =
- _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
- btspool->bts_otape[i] =
- _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
- }
- pfree((void *) fname);
-
- _bt_isortcmpinit(index, btspool);
-
- return((void *) btspool);
+ BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool));
+ int i;
+ char *fname = (char *) palloc(sizeof(TAPETEMP) + 1);
+
+ if (btspool == (BTSpool *) NULL || fname == (char *) NULL)
+ {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+ memset((char *) btspool, 0, sizeof(BTSpool));
+ btspool->bts_ntapes = ntapes;
+ btspool->bts_tape = 0;
+ btspool->isunique = isunique;
+
+ btspool->bts_itape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ btspool->bts_otape =
+ (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes);
+ if (btspool->bts_itape == (BTTapeBlock **) NULL ||
+ btspool->bts_otape == (BTTapeBlock **) NULL)
+ {
+ elog(WARN, "_bt_spoolinit: out of memory");
+ }
+
+ for (i = 0; i < ntapes; ++i)
+ {
+ btspool->bts_itape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ btspool->bts_otape[i] =
+ _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP)));
+ }
+ pfree((void *) fname);
+
+ _bt_isortcmpinit(index, btspool);
+
+ return ((void *) btspool);
}
/*
@@ -573,29 +600,32 @@ _bt_spoolinit(Relation index, int ntapes, bool isunique)
void
_bt_spooldestroy(void *spool)
{
- BTSpool *btspool = (BTSpool *) spool;
- int i;
-
- for (i = 0; i < btspool->bts_ntapes; ++i) {
- _bt_tapedestroy(btspool->bts_otape[i]);
- _bt_tapedestroy(btspool->bts_itape[i]);
- }
- pfree((void *) btspool);
+ BTSpool *btspool = (BTSpool *) spool;
+ int i;
+
+ for (i = 0; i < btspool->bts_ntapes; ++i)
+ {
+ _bt_tapedestroy(btspool->bts_otape[i]);
+ _bt_tapedestroy(btspool->bts_itape[i]);
+ }
+ pfree((void *) btspool);
}
/*
* flush out any dirty output tape blocks
*/
static void
-_bt_spoolflush(BTSpool *btspool)
+_bt_spoolflush(BTSpool * btspool)
{
- int i;
+ int i;
- for (i = 0; i < btspool->bts_ntapes; ++i) {
- if (!EMPTYTAPE(btspool->bts_otape[i])) {
- _bt_tapewrite(btspool->bts_otape[i], 1);
+ for (i = 0; i < btspool->bts_ntapes; ++i)
+ {
+ if (!EMPTYTAPE(btspool->bts_otape[i]))
+ {
+ _bt_tapewrite(btspool->bts_otape[i], 1);
+ }
}
- }
}
/*
@@ -605,36 +635,37 @@ _bt_spoolflush(BTSpool *btspool)
* output tapes.
*/
static void
-_bt_spoolswap(BTSpool *btspool)
+_bt_spoolswap(BTSpool * btspool)
{
- File tmpfd;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- int i;
+ File tmpfd;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ int i;
- for (i = 0; i < btspool->bts_ntapes; ++i) {
- itape = btspool->bts_itape[i];
- otape = btspool->bts_otape[i];
+ for (i = 0; i < btspool->bts_ntapes; ++i)
+ {
+ itape = btspool->bts_itape[i];
+ otape = btspool->bts_otape[i];
- /*
- * swap the input and output VFDs.
- */
- tmpfd = itape->bttb_fd;
- itape->bttb_fd = otape->bttb_fd;
- otape->bttb_fd = tmpfd;
+ /*
+ * swap the input and output VFDs.
+ */
+ tmpfd = itape->bttb_fd;
+ itape->bttb_fd = otape->bttb_fd;
+ otape->bttb_fd = tmpfd;
- /*
- * rewind the new input tape.
- */
- _bt_taperewind(itape);
- _bt_tapereset(itape);
+ /*
+ * rewind the new input tape.
+ */
+ _bt_taperewind(itape);
+ _bt_tapereset(itape);
- /*
- * clear the new output tape -- it's ok to throw away the old
- * inputs.
- */
- _bt_tapeclear(otape);
- }
+ /*
+ * clear the new output tape -- it's ok to throw away the old
+ * inputs.
+ */
+ _bt_tapeclear(otape);
+ }
}
/*-------------------------------------------------------------------------
@@ -643,7 +674,7 @@ _bt_spoolswap(BTSpool *btspool)
*/
/*
- * spool 'btitem' into an initial run. as tape blocks are filled, the
+ * spool 'btitem' into an initial run. as tape blocks are filled, the
* block BTItems are qsorted and written into some output tape (it
* doesn't matter which; we go round-robin for simplicity). the
* initial runs are therefore always just one block.
@@ -651,134 +682,137 @@ _bt_spoolswap(BTSpool *btspool)
void
_bt_spool(Relation index, BTItem btitem, void *spool)
{
- BTSpool *btspool = (BTSpool *) spool;
- BTTapeBlock *itape;
- Size itemsz;
-
- _bt_isortcmpinit (index, btspool);
-
- itape = btspool->bts_itape[btspool->bts_tape];
- itemsz = BTITEMSZ(btitem);
- itemsz = DOUBLEALIGN(itemsz);
-
- /*
- * if this buffer is too full for this BTItemData, or if we have
- * run out of BTItems, we need to sort the buffer and write it
- * out. in this case, the BTItemData will go into the next tape's
- * buffer.
- */
- if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) {
- BTSortKey *parray = (BTSortKey *) NULL;
- BTTapeBlock *otape;
- BTItem bti;
- char *pos;
- int btisz;
- int it_ntup = itape->bttb_ntup;
- int i;
+ BTSpool *btspool = (BTSpool *) spool;
+ BTTapeBlock *itape;
+ Size itemsz;
- /*
- * build an array of pointers to the BTItemDatas on the input
- * block.
- */
- if (it_ntup > 0) {
- parray =
- (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
- pos = itape->bttb_data;
- for (i = 0; i < it_ntup; ++i) {
- _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
- }
-
- /*
- * qsort the pointer array.
- */
- qsort((void *) parray, it_ntup, sizeof(BTSortKey),
- (int (*)(const void *,const void *))_bt_isortcmp);
- }
+ _bt_isortcmpinit(index, btspool);
+
+ itape = btspool->bts_itape[btspool->bts_tape];
+ itemsz = BTITEMSZ(btitem);
+ itemsz = DOUBLEALIGN(itemsz);
/*
- * write the spooled run into the output tape. we copy the
- * BTItemDatas in the order dictated by the sorted array of
- * BTItems, not the original order.
- *
- * (since everything was DOUBLEALIGN'd and is all on a single
- * tape block, everything had *better* still fit on one tape
- * block..)
+ * if this buffer is too full for this BTItemData, or if we have run
+ * out of BTItems, we need to sort the buffer and write it out. in
+ * this case, the BTItemData will go into the next tape's buffer.
*/
- otape = btspool->bts_otape[btspool->bts_tape];
- for (i = 0; i < it_ntup; ++i) {
- bti = parray[i].btsk_item;
- btisz = BTITEMSZ(bti);
- btisz = DOUBLEALIGN(btisz);
- _bt_tapeadd(otape, bti, btisz);
+ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz)
+ {
+ BTSortKey *parray = (BTSortKey *) NULL;
+ BTTapeBlock *otape;
+ BTItem bti;
+ char *pos;
+ int btisz;
+ int it_ntup = itape->bttb_ntup;
+ int i;
+
+ /*
+ * build an array of pointers to the BTItemDatas on the input
+ * block.
+ */
+ if (it_ntup > 0)
+ {
+ parray =
+ (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey));
+ pos = itape->bttb_data;
+ for (i = 0; i < it_ntup; ++i)
+ {
+ _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i]));
+ }
+
+ /*
+ * qsort the pointer array.
+ */
+ qsort((void *) parray, it_ntup, sizeof(BTSortKey),
+ (int (*) (const void *, const void *)) _bt_isortcmp);
+ }
+
+ /*
+ * write the spooled run into the output tape. we copy the
+ * BTItemDatas in the order dictated by the sorted array of
+ * BTItems, not the original order.
+ *
+ * (since everything was DOUBLEALIGN'd and is all on a single tape
+ * block, everything had *better* still fit on one tape block..)
+ */
+ otape = btspool->bts_otape[btspool->bts_tape];
+ for (i = 0; i < it_ntup; ++i)
+ {
+ bti = parray[i].btsk_item;
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ _bt_tapeadd(otape, bti, btisz);
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
- &isnull);
- printf("_bt_spool: inserted <%x> into output tape %d\n",
- d, btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
- }
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att,
+ &isnull);
- /*
- * the initial runs are always single tape blocks. flush the
- * output block, marking End-Of-Run.
- */
- _bt_tapewrite(otape, 1);
+ printf("_bt_spool: inserted <%x> into output tape %d\n",
+ d, btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */
+ }
- /*
- * reset the input buffer for the next run. we don't have to
- * write it out or anything -- we only use it to hold the
- * unsorted BTItemDatas, the output tape contains all the
- * sorted stuff.
- *
- * changing bts_tape changes the output tape and input tape;
- * we change itape for the code below.
- */
- _bt_tapereset(itape);
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- itape = btspool->bts_itape[btspool->bts_tape];
+ /*
+ * the initial runs are always single tape blocks. flush the
+ * output block, marking End-Of-Run.
+ */
+ _bt_tapewrite(otape, 1);
- /*
- * destroy the pointer array.
- */
- if (parray != (BTSortKey *) NULL)
- {
- for (i = 0; i < it_ntup; i++)
- {
- if ( parray[i].btsk_datum != (Datum*) NULL )
- pfree ((void*)(parray[i].btsk_datum));
- if ( parray[i].btsk_nulls != (char*) NULL )
- pfree ((void*)(parray[i].btsk_nulls));
- }
- pfree((void *) parray);
+ /*
+ * reset the input buffer for the next run. we don't have to
+ * write it out or anything -- we only use it to hold the unsorted
+ * BTItemDatas, the output tape contains all the sorted stuff.
+ *
+ * changing bts_tape changes the output tape and input tape; we
+ * change itape for the code below.
+ */
+ _bt_tapereset(itape);
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ itape = btspool->bts_itape[btspool->bts_tape];
+
+ /*
+ * destroy the pointer array.
+ */
+ if (parray != (BTSortKey *) NULL)
+ {
+ for (i = 0; i < it_ntup; i++)
+ {
+ if (parray[i].btsk_datum != (Datum *) NULL)
+ pfree((void *) (parray[i].btsk_datum));
+ if (parray[i].btsk_nulls != (char *) NULL)
+ pfree((void *) (parray[i].btsk_nulls));
+ }
+ pfree((void *) parray);
+ }
}
- }
- /* insert this item into the current buffer */
- if (btitem != (BTItem) NULL) {
- _bt_tapeadd(itape, btitem, itemsz);
- }
+ /* insert this item into the current buffer */
+ if (btitem != (BTItem) NULL)
+ {
+ _bt_tapeadd(itape, btitem, itemsz);
+ }
}
/*
* allocate a new, clean btree page, not linked to any siblings.
*/
static void
-_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
+_bt_blnewpage(Relation index, Buffer * buf, Page * page, int flags)
{
- BTPageOpaque opaque;
+ BTPageOpaque opaque;
- *buf = _bt_getbuf(index, P_NEW, BT_WRITE);
+ *buf = _bt_getbuf(index, P_NEW, BT_WRITE);
#if 0
- printf("\tblk=%d\n", BufferGetBlockNumber(*buf));
+ printf("\tblk=%d\n", BufferGetBlockNumber(*buf));
#endif
- *page = BufferGetPage(*buf);
- _bt_pageinit(*page, BufferGetPageSize(*buf));
- opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
- opaque->btpo_prev = opaque->btpo_next = P_NONE;
- opaque->btpo_flags = flags;
+ *page = BufferGetPage(*buf);
+ _bt_pageinit(*page, BufferGetPageSize(*buf));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
+ opaque->btpo_prev = opaque->btpo_next = P_NONE;
+ opaque->btpo_flags = flags;
}
/*
@@ -790,42 +824,44 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
static void
_bt_slideleft(Relation index, Buffer buf, Page page)
{
- OffsetNumber off;
- OffsetNumber maxoff;
- ItemId previi;
- ItemId thisii;
-
- if (!PageIsEmpty(page)) {
- maxoff = PageGetMaxOffsetNumber(page);
- previi = PageGetItemId(page, P_HIKEY);
- for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) {
- thisii = PageGetItemId(page, off);
- *previi = *thisii;
- previi = thisii;
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ ItemId previi;
+ ItemId thisii;
+
+ if (!PageIsEmpty(page))
+ {
+ maxoff = PageGetMaxOffsetNumber(page);
+ previi = PageGetItemId(page, P_HIKEY);
+ for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ thisii = PageGetItemId(page, off);
+ *previi = *thisii;
+ previi = thisii;
+ }
+ ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
}
- ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
- }
}
/*
* allocate and initialize a new BTPageState. the returned structure
* is suitable for immediate use by _bt_buildadd.
*/
-static void *
+static void *
_bt_pagestate(Relation index, int flags, int level, bool doupper)
{
- BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
-
- memset((char *) state, 0, sizeof(BTPageState));
- _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
- state->btps_firstoff = InvalidOffsetNumber;
- state->btps_lastoff = P_HIKEY;
- state->btps_lastbti = (BTItem) NULL;
- state->btps_next = (BTPageState *) NULL;
- state->btps_level = level;
- state->btps_doupper = doupper;
-
- return((void *) state);
+ BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
+
+ memset((char *) state, 0, sizeof(BTPageState));
+ _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
+ state->btps_firstoff = InvalidOffsetNumber;
+ state->btps_lastoff = P_HIKEY;
+ state->btps_lastbti = (BTItem) NULL;
+ state->btps_next = (BTPageState *) NULL;
+ state->btps_level = level;
+ state->btps_doupper = doupper;
+
+ return ((void *) state);
}
/*
@@ -834,19 +870,19 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper)
* the page to which the item used to point, e.g., a heap page if
* 'opage' is a leaf page).
*/
-static BTItem
+static BTItem
_bt_minitem(Page opage, BlockNumber oblkno, int atend)
{
- OffsetNumber off;
- BTItem obti;
- BTItem nbti;
+ OffsetNumber off;
+ BTItem obti;
+ BTItem nbti;
- off = atend ? P_HIKEY : P_FIRSTKEY;
- obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off));
- nbti = _bt_formitem(&(obti->bti_itup));
- ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY);
+ off = atend ? P_HIKEY : P_FIRSTKEY;
+ obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off));
+ nbti = _bt_formitem(&(obti->bti_itup));
+ ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY);
- return(nbti);
+ return (nbti);
}
/*
@@ -855,26 +891,26 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* we must be careful to observe the following restrictions, placed
* upon us by the conventions in nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at
- * P_FIRSTKEY.
+ * P_FIRSTKEY.
* - duplicates cannot be split among pages unless the chain of
- * duplicates starts at the first data item.
+ * duplicates starts at the first data item.
*
* a leaf page being built looks like:
*
* +----------------+---------------------------------+
- * | PageHeaderData | linp0 linp1 linp2 ... |
+ * | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
- * | ... linpN | ^ first |
+ * | ... linpN | ^ first |
* +-----------+--------------------------------------+
- * | ^ last |
- * | |
- * | v last |
+ * | ^ last |
+ * | |
+ * | v last |
* +-------------+------------------------------------+
- * | | itemN ... |
+ * | | itemN ... |
* +-------------+------------------+-----------------+
- * | ... item3 item2 item1 | "special space" |
+ * | ... item3 item2 item1 | "special space" |
* +--------------------------------+-----------------+
- * ^ first
+ * ^ first
*
* contrast this with the diagram in bufpage.h; note the mismatch
* between linps and items. this is because we reserve linp0 as a
@@ -888,216 +924,230 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
*
* if all keys are unique, 'first' will always be the same as 'last'.
*/
-static BTItem
+static BTItem
_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags)
{
- BTPageState *state = (BTPageState *) pstate;
- Buffer nbuf;
- Page npage;
- BTItem last_bti;
- OffsetNumber first_off;
- OffsetNumber last_off;
- OffsetNumber off;
- Size pgspc;
- Size btisz;
-
- nbuf = state->btps_buf;
- npage = state->btps_page;
- first_off = state->btps_firstoff;
- last_off = state->btps_lastoff;
- last_bti = state->btps_lastbti;
-
- pgspc = PageGetFreeSpace(npage);
- btisz = BTITEMSZ(bti);
- btisz = DOUBLEALIGN(btisz);
- if (pgspc < btisz) {
- Buffer obuf = nbuf;
- Page opage = npage;
- OffsetNumber o, n;
- ItemId ii;
- ItemId hii;
-
- _bt_blnewpage(index, &nbuf, &npage, flags);
+ BTPageState *state = (BTPageState *) pstate;
+ Buffer nbuf;
+ Page npage;
+ BTItem last_bti;
+ OffsetNumber first_off;
+ OffsetNumber last_off;
+ OffsetNumber off;
+ Size pgspc;
+ Size btisz;
+
+ nbuf = state->btps_buf;
+ npage = state->btps_page;
+ first_off = state->btps_firstoff;
+ last_off = state->btps_lastoff;
+ last_bti = state->btps_lastbti;
+
+ pgspc = PageGetFreeSpace(npage);
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (pgspc < btisz)
+ {
+ Buffer obuf = nbuf;
+ Page opage = npage;
+ OffsetNumber o,
+ n;
+ ItemId ii;
+ ItemId hii;
- /*
- * if 'last' is part of a chain of duplicates that does not
- * start at the beginning of the old page, the entire chain is
- * copied to the new page; we delete all of the duplicates
- * from the old page except the first, which becomes the high
- * key item of the old page.
- *
- * if the chain starts at the beginning of the page or there
- * is no chain ('first' == 'last'), we need only copy 'last'
- * to the new page. again, 'first' (== 'last') becomes the
- * high key of the old page.
- *
- * note that in either case, we copy at least one item to the
- * new page, so 'last_bti' will always be valid. 'bti' will
- * never be the first data item on the new page.
- */
- if (first_off == P_FIRSTKEY) {
- Assert(last_off != P_FIRSTKEY);
- first_off = last_off;
- }
- for (o = first_off, n = P_FIRSTKEY;
- o <= last_off;
- o = OffsetNumberNext(o), n = OffsetNumberNext(n)) {
- ii = PageGetItemId(opage, o);
- if ( PageAddItem(npage, PageGetItem(opage, ii),
- ii->lp_len, n, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item to the page in _bt_sort (1)");
+ _bt_blnewpage(index, &nbuf, &npage, flags);
+
+ /*
+ * if 'last' is part of a chain of duplicates that does not start
+ * at the beginning of the old page, the entire chain is copied to
+ * the new page; we delete all of the duplicates from the old page
+ * except the first, which becomes the high key item of the old
+ * page.
+ *
+ * if the chain starts at the beginning of the page or there is no
+ * chain ('first' == 'last'), we need only copy 'last' to the new
+ * page. again, 'first' (== 'last') becomes the high key of the
+ * old page.
+ *
+ * note that in either case, we copy at least one item to the new
+ * page, so 'last_bti' will always be valid. 'bti' will never be
+ * the first data item on the new page.
+ */
+ if (first_off == P_FIRSTKEY)
+ {
+ Assert(last_off != P_FIRSTKEY);
+ first_off = last_off;
+ }
+ for (o = first_off, n = P_FIRSTKEY;
+ o <= last_off;
+ o = OffsetNumberNext(o), n = OffsetNumberNext(n))
+ {
+ ii = PageGetItemId(opage, o);
+ if (PageAddItem(npage, PageGetItem(opage, ii),
+ ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- BTItem tmpbti =
- (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
- Datum d = index_getattr(&(tmpbti->bti_itup), 1,
- index->rd_att, &isnull);
- printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
- d, n, state->btps_level);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+ {
+ bool isnull;
+ BTItem tmpbti =
+ (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
+ Datum d = index_getattr(&(tmpbti->bti_itup), 1,
+ index->rd_att, &isnull);
+
+ printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
+ d, n, state->btps_level);
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
- }
- /*
- * this loop is backward because PageIndexTupleDelete shuffles
- * the tuples to fill holes in the page -- by starting at the
- * end and working back, we won't create holes (and thereby
- * avoid shuffling).
- */
- for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) {
- PageIndexTupleDelete(opage, o);
- }
- hii = PageGetItemId(opage, P_HIKEY);
- ii = PageGetItemId(opage, first_off);
- *hii = *ii;
- ii->lp_flags &= ~LP_USED;
- ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
+ }
- first_off = P_FIRSTKEY;
- last_off = PageGetMaxOffsetNumber(npage);
- last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
+ /*
+ * this loop is backward because PageIndexTupleDelete shuffles the
+ * tuples to fill holes in the page -- by starting at the end and
+ * working back, we won't create holes (and thereby avoid
+ * shuffling).
+ */
+ for (o = last_off; o > first_off; o = OffsetNumberPrev(o))
+ {
+ PageIndexTupleDelete(opage, o);
+ }
+ hii = PageGetItemId(opage, P_HIKEY);
+ ii = PageGetItemId(opage, first_off);
+ *hii = *ii;
+ ii->lp_flags &= ~LP_USED;
+ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
- /*
- * set the page (side link) pointers.
- */
- {
- BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
- BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
-
- oopaque->btpo_next = BufferGetBlockNumber(nbuf);
- nopaque->btpo_prev = BufferGetBlockNumber(obuf);
- nopaque->btpo_next = P_NONE;
-
- if ( _bt_itemcmp(index, _bt_nattr,
- (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
- (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
- BTEqualStrategyNumber) )
- oopaque->btpo_flags |= BTP_CHAIN;
- }
+ first_off = P_FIRSTKEY;
+ last_off = PageGetMaxOffsetNumber(npage);
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
- /*
- * copy the old buffer's minimum key to its parent. if we
- * don't have a parent, we have to create one; this adds a new
- * btree level.
- */
- if (state->btps_doupper) {
- BTItem nbti;
-
- if (state->btps_next == (BTPageState *) NULL) {
- state->btps_next =
- _bt_pagestate(index, 0, state->btps_level + 1, true);
- }
- nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
- _bt_buildadd(index, state->btps_next, nbti, 0);
- pfree((void *) nbti);
+ /*
+ * set the page (side link) pointers.
+ */
+ {
+ BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
+ BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
+
+ oopaque->btpo_next = BufferGetBlockNumber(nbuf);
+ nopaque->btpo_prev = BufferGetBlockNumber(obuf);
+ nopaque->btpo_next = P_NONE;
+
+ if (_bt_itemcmp(index, _bt_nattr,
+ (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
+ (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
+ BTEqualStrategyNumber))
+ oopaque->btpo_flags |= BTP_CHAIN;
+ }
+
+ /*
+ * copy the old buffer's minimum key to its parent. if we don't
+ * have a parent, we have to create one; this adds a new btree
+ * level.
+ */
+ if (state->btps_doupper)
+ {
+ BTItem nbti;
+
+ if (state->btps_next == (BTPageState *) NULL)
+ {
+ state->btps_next =
+ _bt_pagestate(index, 0, state->btps_level + 1, true);
+ }
+ nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
+ _bt_buildadd(index, state->btps_next, nbti, 0);
+ pfree((void *) nbti);
+ }
+
+ /*
+ * write out the old stuff. we never want to see it again, so we
+ * can give up our lock (if we had one; BuildingBtree is set, so
+ * we aren't locking).
+ */
+ _bt_wrtbuf(index, obuf);
}
/*
- * write out the old stuff. we never want to see it again, so
- * we can give up our lock (if we had one; BuildingBtree is
- * set, so we aren't locking).
+ * if this item is different from the last item added, we start a new
+ * chain of duplicates.
*/
- _bt_wrtbuf(index, obuf);
- }
-
- /*
- * if this item is different from the last item added, we start a
- * new chain of duplicates.
- */
- off = OffsetNumberNext(last_off);
- if ( PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber )
- elog (FATAL, "btree: failed to add item to the page in _bt_sort (2)");
+ off = OffsetNumberNext(last_off);
+ if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
#if 0
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
- printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
- d, off, state->btps_level);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
+
+ printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
+ d, off, state->btps_level);
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
#endif
- if (last_bti == (BTItem) NULL)
- {
- first_off = P_FIRSTKEY;
- }
- else if ( !_bt_itemcmp(index, _bt_nattr,
- bti, last_bti, BTEqualStrategyNumber) )
- {
- first_off = off;
- }
- last_off = off;
- last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
-
- state->btps_buf = nbuf;
- state->btps_page = npage;
- state->btps_lastbti = last_bti;
- state->btps_lastoff = last_off;
- state->btps_firstoff = first_off;
-
- return(last_bti);
+ if (last_bti == (BTItem) NULL)
+ {
+ first_off = P_FIRSTKEY;
+ }
+ else if (!_bt_itemcmp(index, _bt_nattr,
+ bti, last_bti, BTEqualStrategyNumber))
+ {
+ first_off = off;
+ }
+ last_off = off;
+ last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
+
+ state->btps_buf = nbuf;
+ state->btps_page = npage;
+ state->btps_lastbti = last_bti;
+ state->btps_lastoff = last_off;
+ state->btps_firstoff = first_off;
+
+ return (last_bti);
}
static void
-_bt_uppershutdown(Relation index, BTPageState *state)
+_bt_uppershutdown(Relation index, BTPageState * state)
{
- BTPageState *s;
- BlockNumber blkno;
- BTPageOpaque opaque;
- BTItem bti;
+ BTPageState *s;
+ BlockNumber blkno;
+ BTPageOpaque opaque;
+ BTItem bti;
- for (s = state; s != (BTPageState *) NULL; s = s->btps_next) {
- blkno = BufferGetBlockNumber(s->btps_buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
+ for (s = state; s != (BTPageState *) NULL; s = s->btps_next)
+ {
+ blkno = BufferGetBlockNumber(s->btps_buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
- /*
- * if this is the root, attach it to the metapage. otherwise,
- * stick the minimum key of the last page on this level (which
- * has not been split, or else it wouldn't be the last page)
- * into its parent. this may cause the last page of upper
- * levels to split, but that's not a problem -- we haven't
- * gotten to them yet.
- */
- if (s->btps_doupper) {
- if (s->btps_next == (BTPageState *) NULL) {
- opaque->btpo_flags |= BTP_ROOT;
- _bt_metaproot(index, blkno, s->btps_level + 1);
- } else {
- bti = _bt_minitem(s->btps_page, blkno, 0);
- _bt_buildadd(index, s->btps_next, bti, 0);
- pfree((void *) bti);
- }
- }
+ /*
+ * if this is the root, attach it to the metapage. otherwise,
+ * stick the minimum key of the last page on this level (which has
+ * not been split, or else it wouldn't be the last page) into its
+ * parent. this may cause the last page of upper levels to split,
+ * but that's not a problem -- we haven't gotten to them yet.
+ */
+ if (s->btps_doupper)
+ {
+ if (s->btps_next == (BTPageState *) NULL)
+ {
+ opaque->btpo_flags |= BTP_ROOT;
+ _bt_metaproot(index, blkno, s->btps_level + 1);
+ }
+ else
+ {
+ bti = _bt_minitem(s->btps_page, blkno, 0);
+ _bt_buildadd(index, s->btps_next, bti, 0);
+ pfree((void *) bti);
+ }
+ }
- /*
- * this is the rightmost page, so the ItemId array needs to be
- * slid back one slot.
- */
- _bt_slideleft(index, s->btps_buf, s->btps_page);
- _bt_wrtbuf(index, s->btps_buf);
- }
+ /*
+ * this is the rightmost page, so the ItemId array needs to be
+ * slid back one slot.
+ */
+ _bt_slideleft(index, s->btps_buf, s->btps_page);
+ _bt_wrtbuf(index, s->btps_buf);
+ }
}
/*
@@ -1105,203 +1155,230 @@ _bt_uppershutdown(Relation index, BTPageState *state)
* merging passes until at most one run is left in each tape. at that
* point, merge the final tape runs into a set of btree leaves.
*
- * XXX three nested loops? gross. cut me up into smaller routines.
+ * XXX three nested loops? gross. cut me up into smaller routines.
*/
static void
-_bt_merge(Relation index, BTSpool *btspool)
+_bt_merge(Relation index, BTSpool * btspool)
{
- BTPageState *state;
- BTPriQueue q;
- BTPriQueueElem e;
- BTSortKey btsk;
- BTItem bti;
- BTTapeBlock *itape;
- BTTapeBlock *otape;
- char *tapepos[MAXTAPES];
- int tapedone[MAXTAPES];
- int t;
- int goodtapes;
- int npass;
- int nruns;
- Size btisz;
- bool doleaf = false;
-
- /*
- * initialize state needed for the merge into the btree leaf pages.
- */
- state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
-
- npass = 0;
- do { /* pass */
+ BTPageState *state;
+ BTPriQueue q;
+ BTPriQueueElem e;
+ BTSortKey btsk;
+ BTItem bti;
+ BTTapeBlock *itape;
+ BTTapeBlock *otape;
+ char *tapepos[MAXTAPES];
+ int tapedone[MAXTAPES];
+ int t;
+ int goodtapes;
+ int npass;
+ int nruns;
+ Size btisz;
+ bool doleaf = false;
+
/*
- * each pass starts by flushing the previous outputs and
- * swapping inputs and outputs. flushing sets End-of-Run for
- * any dirty output tapes. swapping clears the new output
- * tapes and rewinds the new input tapes.
+ * initialize state needed for the merge into the btree leaf pages.
*/
- btspool->bts_tape = btspool->bts_ntapes - 1;
- _bt_spoolflush(btspool);
- _bt_spoolswap(btspool);
-
- ++npass;
- nruns = 0;
-
- for (;;) { /* run */
- /*
- * each run starts by selecting a new output tape. the
- * merged results of a given run are always sent to this
- * one tape.
- */
- btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
- otape = btspool->bts_otape[btspool->bts_tape];
-
- /*
- * initialize the priority queue by loading it with the
- * first element of the given run in each tape. since we
- * are starting a new run, we reset the tape (clearing the
- * End-Of-Run marker) before reading it. this means that
- * _bt_taperead will return 0 only if the tape is actually
- * at EOF.
- */
- memset((char *) &q, 0, sizeof(BTPriQueue));
- goodtapes = 0;
- for (t = 0; t < btspool->bts_ntapes; ++t) {
- itape = btspool->bts_itape[t];
- tapepos[t] = itape->bttb_data;
- tapedone[t] = 0;
- _bt_tapereset(itape);
- do {
- if (_bt_taperead(itape) == 0) {
- tapedone[t] = 1;
- }
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t]) {
- ++goodtapes;
- e.btpqe_tape = t;
- _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
- &(e.btpqe_item));
- if (e.btpqe_item.btsk_item != (BTItem) NULL) {
- _bt_pqadd(&q, &e);
- }
- }
- }
- /*
- * if we don't have any tapes with any input (i.e., they
- * are all at EOF), there is no work to do in this run --
- * we must be done with this pass.
- */
- if (goodtapes == 0) {
- break; /* for */
- }
- ++nruns;
-
- /*
- * output the smallest element from the queue until there
- * are no more.
- */
- while (_bt_pqnext(&q, &e) >= 0) { /* item */
+ state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true);
+
+ npass = 0;
+ do
+ { /* pass */
+
/*
- * replace the element taken from priority queue,
- * fetching a new block if needed. a tape can run out
- * if it hits either End-Of-Run or EOF.
+ * each pass starts by flushing the previous outputs and swapping
+ * inputs and outputs. flushing sets End-of-Run for any dirty
+ * output tapes. swapping clears the new output tapes and rewinds
+ * the new input tapes.
*/
- t = e.btpqe_tape;
- btsk = e.btpqe_item;
- bti = btsk.btsk_item;
- if (bti != (BTItem) NULL) {
- btisz = BTITEMSZ(bti);
- btisz = DOUBLEALIGN(btisz);
- if (doleaf) {
- _bt_buildadd(index, state, bti, BTP_LEAF);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+ btspool->bts_tape = btspool->bts_ntapes - 1;
+ _bt_spoolflush(btspool);
+ _bt_spoolswap(btspool);
+
+ ++npass;
+ nruns = 0;
+
+ for (;;)
+ { /* run */
+
+ /*
+ * each run starts by selecting a new output tape. the merged
+ * results of a given run are always sent to this one tape.
+ */
+ btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes;
+ otape = btspool->bts_otape[btspool->bts_tape];
+
+ /*
+ * initialize the priority queue by loading it with the first
+ * element of the given run in each tape. since we are
+ * starting a new run, we reset the tape (clearing the
+ * End-Of-Run marker) before reading it. this means that
+ * _bt_taperead will return 0 only if the tape is actually at
+ * EOF.
+ */
+ memset((char *) &q, 0, sizeof(BTPriQueue));
+ goodtapes = 0;
+ for (t = 0; t < btspool->bts_ntapes; ++t)
{
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
- npass, nruns, d, t,
- BufferGetBlockNumber(state->btps_buf));
+ itape = btspool->bts_itape[t];
+ tapepos[t] = itape->bttb_data;
+ tapedone[t] = 0;
+ _bt_tapereset(itape);
+ do
+ {
+ if (_bt_taperead(itape) == 0)
+ {
+ tapedone[t] = 1;
+ }
+ } while (!tapedone[t] && EMPTYTAPE(itape));
+ if (!tapedone[t])
+ {
+ ++goodtapes;
+ e.btpqe_tape = t;
+ _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]),
+ &(e.btpqe_item));
+ if (e.btpqe_item.btsk_item != (BTItem) NULL)
+ {
+ _bt_pqadd(&q, &e);
+ }
+ }
}
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- } else {
- if (SPCLEFT(otape) < btisz) {
- /*
- * if it's full, write it out and add the
- * item to the next block. (since we will
- * be adding another tuple immediately
- * after this, we can be sure that there
- * will be at least one more block in this
- * run and so we know we do *not* want to
- * set End-Of-Run here.)
- */
- _bt_tapewrite(otape, 0);
- }
- _bt_tapeadd(otape, bti, btisz);
-#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+
+ /*
+ * if we don't have any tapes with any input (i.e., they are
+ * all at EOF), there is no work to do in this run -- we must
+ * be done with this pass.
+ */
+ if (goodtapes == 0)
{
- bool isnull;
- Datum d = index_getattr(&(bti->bti_itup), 1,
- index->rd_att, &isnull);
- printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
- npass, nruns, d, t,
- btspool->bts_tape);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- }
-
- if ( btsk.btsk_datum != (Datum*) NULL )
- pfree ((void*)(btsk.btsk_datum));
- if ( btsk.btsk_nulls != (char*) NULL )
- pfree ((void*)(btsk.btsk_nulls));
-
- }
- itape = btspool->bts_itape[t];
- if (!tapedone[t]) {
- BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
-
- if (newbti == (BTItem) NULL) {
- do {
- if (_bt_taperead(itape) == 0) {
- tapedone[t] = 1;
- }
- } while (!tapedone[t] && EMPTYTAPE(itape));
- if (!tapedone[t]) {
- tapepos[t] = itape->bttb_data;
- newbti = _bt_tapenext(itape, &tapepos[t]);
+ break; /* for */
}
- }
- if (newbti != (BTItem) NULL) {
- BTPriQueueElem nexte;
-
- nexte.btpqe_tape = t;
- _bt_setsortkey(index, newbti, &(nexte.btpqe_item));
- _bt_pqadd(&q, &nexte);
- }
+ ++nruns;
+
+ /*
+ * output the smallest element from the queue until there are
+ * no more.
+ */
+ while (_bt_pqnext(&q, &e) >= 0)
+ { /* item */
+
+ /*
+ * replace the element taken from priority queue, fetching
+ * a new block if needed. a tape can run out if it hits
+ * either End-Of-Run or EOF.
+ */
+ t = e.btpqe_tape;
+ btsk = e.btpqe_item;
+ bti = btsk.btsk_item;
+ if (bti != (BTItem) NULL)
+ {
+ btisz = BTITEMSZ(bti);
+ btisz = DOUBLEALIGN(btisz);
+ if (doleaf)
+ {
+ _bt_buildadd(index, state, bti, BTP_LEAF);
+#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ index->rd_att, &isnull);
+
+ printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n",
+ npass, nruns, d, t,
+ BufferGetBlockNumber(state->btps_buf));
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+ }
+ else
+ {
+ if (SPCLEFT(otape) < btisz)
+ {
+
+ /*
+ * if it's full, write it out and add the item
+ * to the next block. (since we will be
+ * adding another tuple immediately after
+ * this, we can be sure that there will be at
+ * least one more block in this run and so we
+ * know we do *not* want to set End-Of-Run
+ * here.)
+ */
+ _bt_tapewrite(otape, 0);
+ }
+ _bt_tapeadd(otape, bti, btisz);
+#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
+ {
+ bool isnull;
+ Datum d = index_getattr(&(bti->bti_itup), 1,
+ index->rd_att, &isnull);
+
+ printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n",
+ npass, nruns, d, t,
+ btspool->bts_tape);
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+ }
+
+ if (btsk.btsk_datum != (Datum *) NULL)
+ pfree((void *) (btsk.btsk_datum));
+ if (btsk.btsk_nulls != (char *) NULL)
+ pfree((void *) (btsk.btsk_nulls));
+
+ }
+ itape = btspool->bts_itape[t];
+ if (!tapedone[t])
+ {
+ BTItem newbti = _bt_tapenext(itape, &tapepos[t]);
+
+ if (newbti == (BTItem) NULL)
+ {
+ do
+ {
+ if (_bt_taperead(itape) == 0)
+ {
+ tapedone[t] = 1;
+ }
+ } while (!tapedone[t] && EMPTYTAPE(itape));
+ if (!tapedone[t])
+ {
+ tapepos[t] = itape->bttb_data;
+ newbti = _bt_tapenext(itape, &tapepos[t]);
+ }
+ }
+ if (newbti != (BTItem) NULL)
+ {
+ BTPriQueueElem nexte;
+
+ nexte.btpqe_tape = t;
+ _bt_setsortkey(index, newbti, &(nexte.btpqe_item));
+ _bt_pqadd(&q, &nexte);
+ }
+ }
+ } /* item */
+
+ /*
+ * that's it for this run. flush the output tape, marking
+ * End-of-Run.
+ */
+ _bt_tapewrite(otape, 1);
+ } /* run */
+
+ /*
+ * we are here because we ran out of input on all of the input
+ * tapes.
+ *
+ * if this pass did not generate more actual output runs than we have
+ * tapes, we know we have at most one run in each tape. this
+ * means that we are ready to merge into the final btree leaf
+ * pages instead of merging into a tape file.
+ */
+ if (nruns <= btspool->bts_ntapes)
+ {
+ doleaf = true;
}
- } /* item */
-
- /*
- * that's it for this run. flush the output tape, marking
- * End-of-Run.
- */
- _bt_tapewrite(otape, 1);
- } /* run */
-
- /*
- * we are here because we ran out of input on all of the input
- * tapes.
- *
- * if this pass did not generate more actual output runs than
- * we have tapes, we know we have at most one run in each
- * tape. this means that we are ready to merge into the final
- * btree leaf pages instead of merging into a tape file.
- */
- if (nruns <= btspool->bts_ntapes) {
- doleaf = true;
- }
- } while (nruns > 0); /* pass */
+ } while (nruns > 0); /* pass */
- _bt_uppershutdown(index, state);
+ _bt_uppershutdown(index, state);
}
@@ -1320,62 +1397,65 @@ _bt_merge(Relation index, BTSpool *btspool)
void
_bt_upperbuild(Relation index)
{
- Buffer rbuf;
- BlockNumber blk;
- Page rpage;
- BTPageOpaque ropaque;
- BTPageState *state;
- BTItem nbti;
-
- /*
- * find the first leaf block. while we're at it, clear the
- * BTP_ROOT flag that we set while building it (so we could find
- * it later).
- */
- rbuf = _bt_getroot(index, BT_WRITE);
- blk = BufferGetBlockNumber(rbuf);
- rpage = BufferGetPage(rbuf);
- ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
- ropaque->btpo_flags &= ~BTP_ROOT;
- _bt_wrtbuf(index, rbuf);
-
- state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
-
- /* for each page... */
- do {
-#if 0
- printf("\t\tblk=%d\n", blk);
-#endif
- rbuf = _bt_getbuf(index, blk, BT_READ);
+ Buffer rbuf;
+ BlockNumber blk;
+ Page rpage;
+ BTPageOpaque ropaque;
+ BTPageState *state;
+ BTItem nbti;
+
+ /*
+ * find the first leaf block. while we're at it, clear the BTP_ROOT
+ * flag that we set while building it (so we could find it later).
+ */
+ rbuf = _bt_getroot(index, BT_WRITE);
+ blk = BufferGetBlockNumber(rbuf);
rpage = BufferGetPage(rbuf);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
-
- /* for each item... */
- if (!PageIsEmpty(rpage)) {
- /*
- * form a new index tuple corresponding to the minimum key
- * of the lower page and insert it into a page at this
- * level.
- */
- nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
+ ropaque->btpo_flags &= ~BTP_ROOT;
+ _bt_wrtbuf(index, rbuf);
+
+ state = (BTPageState *) _bt_pagestate(index, 0, 0, true);
+
+ /* for each page... */
+ do
+ {
+#if 0
+ printf("\t\tblk=%d\n", blk);
+#endif
+ rbuf = _bt_getbuf(index, blk, BT_READ);
+ rpage = BufferGetPage(rbuf);
+ ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+ /* for each item... */
+ if (!PageIsEmpty(rpage))
+ {
+
+ /*
+ * form a new index tuple corresponding to the minimum key of
+ * the lower page and insert it into a page at this level.
+ */
+ nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque));
#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE)
- {
- bool isnull;
- Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
- &isnull);
- printf("_bt_upperbuild: inserting <%x> at %d\n",
- d, state->btps_level);
- }
-#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
- _bt_buildadd(index, state, nbti, 0);
- pfree((void *) nbti);
- }
- blk = ropaque->btpo_next;
- _bt_relbuf(index, rbuf, BT_READ);
- } while (blk != P_NONE);
-
- _bt_uppershutdown(index, state);
+ {
+ bool isnull;
+ Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att,
+ &isnull);
+
+ printf("_bt_upperbuild: inserting <%x> at %d\n",
+ d, state->btps_level);
+ }
+#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */
+ _bt_buildadd(index, state, nbti, 0);
+ pfree((void *) nbti);
+ }
+ blk = ropaque->btpo_next;
+ _bt_relbuf(index, rbuf, BT_READ);
+ } while (blk != P_NONE);
+
+ _bt_uppershutdown(index, state);
}
+
#endif
/*
@@ -1385,17 +1465,17 @@ _bt_upperbuild(Relation index)
void
_bt_leafbuild(Relation index, void *spool)
{
- _bt_isortcmpinit (index, (BTSpool *) spool);
+ _bt_isortcmpinit(index, (BTSpool *) spool);
#ifdef BTREE_BUILD_STATS
- if ( ShowExecutorStats )
- {
- fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
- ShowUsage ();
- ResetUsage ();
- }
+ if (ShowExecutorStats)
+ {
+ fprintf(stderr, "! BtreeBuild (Spool) Stats:\n");
+ ShowUsage();
+ ResetUsage();
+ }
#endif
- _bt_merge(index, (BTSpool *) spool);
+ _bt_merge(index, (BTSpool *) spool);
}
diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c
index 6de003c06a9..5215d2000d8 100644
--- a/src/backend/access/nbtree/nbtstrat.c
+++ b/src/backend/access/nbtree/nbtstrat.c
@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* btstrat.c--
- * Srategy map entries for the btree indexed access method
+ * Srategy map entries for the btree indexed access method
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.4 1996/11/05 10:35:37 scrappy Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.5 1997/09/07 04:39:04 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -20,111 +20,111 @@
/*
* Note:
- * StrategyNegate, StrategyCommute, and StrategyNegateCommute
- * assume <, <=, ==, >=, > ordering.
+ * StrategyNegate, StrategyCommute, and StrategyNegateCommute
+ * assume <, <=, ==, >=, > ordering.
*/
-static StrategyNumber BTNegate[5] = {
- BTGreaterEqualStrategyNumber,
- BTGreaterStrategyNumber,
- InvalidStrategy,
- BTLessStrategyNumber,
- BTLessEqualStrategyNumber
+static StrategyNumber BTNegate[5] = {
+ BTGreaterEqualStrategyNumber,
+ BTGreaterStrategyNumber,
+ InvalidStrategy,
+ BTLessStrategyNumber,
+ BTLessEqualStrategyNumber
};
-static StrategyNumber BTCommute[5] = {
- BTGreaterStrategyNumber,
- BTGreaterEqualStrategyNumber,
- InvalidStrategy,
- BTLessEqualStrategyNumber,
- BTLessStrategyNumber
+static StrategyNumber BTCommute[5] = {
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber,
+ InvalidStrategy,
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber
};
-static StrategyNumber BTNegateCommute[5] = {
- BTLessEqualStrategyNumber,
- BTLessStrategyNumber,
- InvalidStrategy,
- BTGreaterStrategyNumber,
- BTGreaterEqualStrategyNumber
+static StrategyNumber BTNegateCommute[5] = {
+ BTLessEqualStrategyNumber,
+ BTLessStrategyNumber,
+ InvalidStrategy,
+ BTGreaterStrategyNumber,
+ BTGreaterEqualStrategyNumber
};
-static uint16 BTLessTermData[] = { /* XXX type clash */
- 2,
- BTLessStrategyNumber,
- SK_NEGATE,
- BTLessStrategyNumber,
- SK_NEGATE | SK_COMMUTE
+static uint16 BTLessTermData[] = { /* XXX type clash */
+ 2,
+ BTLessStrategyNumber,
+ SK_NEGATE,
+ BTLessStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
};
-static uint16 BTLessEqualTermData[] = { /* XXX type clash */
- 2,
- BTLessEqualStrategyNumber,
- 0x0,
- BTLessEqualStrategyNumber,
- SK_COMMUTE
+static uint16 BTLessEqualTermData[] = { /* XXX type clash */
+ 2,
+ BTLessEqualStrategyNumber,
+ 0x0,
+ BTLessEqualStrategyNumber,
+ SK_COMMUTE
};
static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
- 2,
- BTGreaterEqualStrategyNumber,
- 0x0,
- BTGreaterEqualStrategyNumber,
- SK_COMMUTE
- };
-
-static uint16 BTGreaterTermData[] = { /* XXX type clash */
- 2,
- BTGreaterStrategyNumber,
- SK_NEGATE,
- BTGreaterStrategyNumber,
- SK_NEGATE | SK_COMMUTE
+ 2,
+ BTGreaterEqualStrategyNumber,
+ 0x0,
+ BTGreaterEqualStrategyNumber,
+ SK_COMMUTE
};
-static StrategyTerm BTEqualExpressionData[] = {
- (StrategyTerm)BTLessTermData, /* XXX */
- (StrategyTerm)BTLessEqualTermData, /* XXX */
- (StrategyTerm)BTGreaterEqualTermData, /* XXX */
- (StrategyTerm)BTGreaterTermData, /* XXX */
- NULL
+static uint16 BTGreaterTermData[] = { /* XXX type clash */
+ 2,
+ BTGreaterStrategyNumber,
+ SK_NEGATE,
+ BTGreaterStrategyNumber,
+ SK_NEGATE | SK_COMMUTE
};
-static StrategyEvaluationData BTEvaluationData = {
- /* XXX static for simplicity */
-
- BTMaxStrategyNumber,
- (StrategyTransformMap)BTNegate, /* XXX */
- (StrategyTransformMap)BTCommute, /* XXX */
- (StrategyTransformMap)BTNegateCommute, /* XXX */
+static StrategyTerm BTEqualExpressionData[] = {
+ (StrategyTerm) BTLessTermData, /* XXX */
+ (StrategyTerm) BTLessEqualTermData, /* XXX */
+ (StrategyTerm) BTGreaterEqualTermData, /* XXX */
+ (StrategyTerm) BTGreaterTermData, /* XXX */
+ NULL
+};
+
+static StrategyEvaluationData BTEvaluationData = {
+ /* XXX static for simplicity */
+
+ BTMaxStrategyNumber,
+ (StrategyTransformMap) BTNegate, /* XXX */
+ (StrategyTransformMap) BTCommute, /* XXX */
+ (StrategyTransformMap) BTNegateCommute, /* XXX */
- { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
- NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+ {NULL, NULL, (StrategyExpression) BTEqualExpressionData, NULL, NULL,
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL}
};
/* ----------------------------------------------------------------
- * RelationGetBTStrategy
+ * RelationGetBTStrategy
* ----------------------------------------------------------------
*/
StrategyNumber
_bt_getstrat(Relation rel,
- AttrNumber attno,
- RegProcedure proc)
+ AttrNumber attno,
+ RegProcedure proc)
{
- StrategyNumber strat;
-
- strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
-
- Assert(StrategyNumberIsValid(strat));
-
- return (strat);
+ StrategyNumber strat;
+
+ strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
+
+ Assert(StrategyNumberIsValid(strat));
+
+ return (strat);
}
bool
_bt_invokestrat(Relation rel,
- AttrNumber attno,
- StrategyNumber strat,
- Datum left,
- Datum right)
+ AttrNumber attno,
+ StrategyNumber strat,
+ Datum left,
+ Datum right)
{
- return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
- left, right));
+ return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
+ left, right));
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 738e55dbccd..096f1d2691e 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* btutils.c--
- * Utility code for Postgres btree implementation.
+ * Utility code for Postgres btree implementation.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.11 1997/08/19 21:29:47 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.12 1997/09/07 04:39:05 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -23,367 +23,384 @@
#include <catalog/pg_proc.h>
#include <executor/execdebug.h>
-extern int NIndexTupleProcessed;
+extern int NIndexTupleProcessed;
#ifndef HAVE_MEMMOVE
-# include <regex/utils.h>
+#include <regex/utils.h>
#else
-# include <string.h>
+#include <string.h>
#endif
-ScanKey
+ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
-{
- ScanKey skey;
- TupleDesc itupdesc;
- int natts;
- int i;
- Datum arg;
- RegProcedure proc;
- bool null;
- bits16 flag;
-
- natts = rel->rd_rel->relnatts;
- itupdesc = RelationGetTupleDescriptor(rel);
-
- skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
-
- for (i = 0; i < natts; i++) {
- arg = index_getattr(itup, i + 1, itupdesc, &null);
- if ( null )
- {
- proc = NullValueRegProcedure;
- flag = SK_ISNULL;
- }
- else
+{
+ ScanKey skey;
+ TupleDesc itupdesc;
+ int natts;
+ int i;
+ Datum arg;
+ RegProcedure proc;
+ bool null;
+ bits16 flag;
+
+ natts = rel->rd_rel->relnatts;
+ itupdesc = RelationGetTupleDescriptor(rel);
+
+ skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+
+ for (i = 0; i < natts; i++)
{
- proc = index_getprocid(rel, i + 1, BTORDER_PROC);
- flag = 0x0;
+ arg = index_getattr(itup, i + 1, itupdesc, &null);
+ if (null)
+ {
+ proc = NullValueRegProcedure;
+ flag = SK_ISNULL;
+ }
+ else
+ {
+ proc = index_getprocid(rel, i + 1, BTORDER_PROC);
+ flag = 0x0;
+ }
+ ScanKeyEntryInitialize(&skey[i],
+ flag, (AttrNumber) (i + 1), proc, arg);
}
- ScanKeyEntryInitialize(&skey[i],
- flag, (AttrNumber) (i + 1), proc, arg);
- }
-
- return (skey);
+
+ return (skey);
}
void
_bt_freeskey(ScanKey skey)
{
- pfree(skey);
+ pfree(skey);
}
void
_bt_freestack(BTStack stack)
{
- BTStack ostack;
-
- while (stack != (BTStack) NULL) {
- ostack = stack;
- stack = stack->bts_parent;
- pfree(ostack->bts_btitem);
- pfree(ostack);
- }
+ BTStack ostack;
+
+ while (stack != (BTStack) NULL)
+ {
+ ostack = stack;
+ stack = stack->bts_parent;
+ pfree(ostack->bts_btitem);
+ pfree(ostack);
+ }
}
/*
- * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
+ * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
*
- * The order of the keys in the qual match the ordering imposed by
- * the index. This routine only needs to be called if there are
- * more than one qual clauses using this index.
+ * The order of the keys in the qual match the ordering imposed by
+ * the index. This routine only needs to be called if there are
+ * more than one qual clauses using this index.
*/
void
_bt_orderkeys(Relation relation, BTScanOpaque so)
{
- ScanKey xform;
- ScanKeyData *cur;
- StrategyMap map;
- int nbytes;
- long test;
- int i, j;
- int init[BTMaxStrategyNumber+1];
- ScanKey key;
- uint16 numberOfKeys = so->numberOfKeys;
- uint16 new_numberOfKeys = 0;
- AttrNumber attno = 1;
-
- if ( numberOfKeys < 1 )
- return;
-
- key = so->keyData;
-
- cur = &key[0];
- if ( cur->sk_attno != 1 )
- elog (WARN, "_bt_orderkeys: key(s) for attribute 1 missed");
-
- if ( numberOfKeys == 1 )
- {
- /*
- * We don't use indices for 'A is null' and 'A is not null'
- * currently and 'A < = > <> NULL' is non-sense' - so
- * qual is not Ok. - vadim 03/21/97
- */
- if ( cur->sk_flags & SK_ISNULL )
- so->qual_ok = 0;
- so->numberOfFirstKeys = 1;
- return;
- }
-
- /* get space for the modified array of keys */
- nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
- xform = (ScanKey) palloc(nbytes);
-
- memset(xform, 0, nbytes);
- map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
- BTMaxStrategyNumber,
- attno);
- for (j = 0; j <= BTMaxStrategyNumber; j++)
- init[j] = 0;
-
- /* check each key passed in */
- for (i = 0; ; )
- {
- if ( i < numberOfKeys )
- cur = &key[i];
-
- if ( cur->sk_flags & SK_ISNULL ) /* see comments above */
- so->qual_ok = 0;
-
- if ( i == numberOfKeys || cur->sk_attno != attno )
+ ScanKey xform;
+ ScanKeyData *cur;
+ StrategyMap map;
+ int nbytes;
+ long test;
+ int i,
+ j;
+ int init[BTMaxStrategyNumber + 1];
+ ScanKey key;
+ uint16 numberOfKeys = so->numberOfKeys;
+ uint16 new_numberOfKeys = 0;
+ AttrNumber attno = 1;
+
+ if (numberOfKeys < 1)
+ return;
+
+ key = so->keyData;
+
+ cur = &key[0];
+ if (cur->sk_attno != 1)
+ elog(WARN, "_bt_orderkeys: key(s) for attribute 1 missed");
+
+ if (numberOfKeys == 1)
{
- if ( cur->sk_attno != attno + 1 && i < numberOfKeys )
- {
- elog (WARN, "_bt_orderkeys: key(s) for attribute %d missed", attno + 1);
- }
- /*
- * If = has been specified, no other key will be used.
- * In case of key < 2 && key == 1 and so on
- * we have to set qual_ok to 0
- */
- if (init[BTEqualStrategyNumber - 1])
- {
- ScanKeyData *eq, *chk;
-
- eq = &xform[BTEqualStrategyNumber - 1];
- for (j = BTMaxStrategyNumber; --j >= 0; )
- {
- if ( j == (BTEqualStrategyNumber - 1) || init[j] == 0 )
- continue;
- chk = &xform[j];
- test = (long) fmgr(chk->sk_procedure, eq->sk_argument, chk->sk_argument);
- if (!test)
- so->qual_ok = 0;
- }
- init[BTLessStrategyNumber - 1] = 0;
- init[BTLessEqualStrategyNumber - 1] = 0;
- init[BTGreaterEqualStrategyNumber - 1] = 0;
- init[BTGreaterStrategyNumber - 1] = 0;
- }
-
- /* only one of <, <= */
- if (init[BTLessStrategyNumber - 1]
- && init[BTLessEqualStrategyNumber - 1])
- {
- ScanKeyData *lt, *le;
-
- lt = &xform[BTLessStrategyNumber - 1];
- le = &xform[BTLessEqualStrategyNumber - 1];
+
/*
- * DO NOT use the cached function stuff here -- this is key
- * ordering, happens only when the user expresses a hokey
- * qualification, and gets executed only once, anyway. The
- * transform maps are hard-coded, and can't be initialized
- * in the correct way.
+ * We don't use indices for 'A is null' and 'A is not null'
+ * currently and 'A < = > <> NULL' is non-sense' - so qual is not
+ * Ok. - vadim 03/21/97
*/
- test = (long) fmgr(le->sk_procedure, lt->sk_argument, le->sk_argument);
- if (test)
- init[BTLessEqualStrategyNumber - 1] = 0;
- else
- init[BTLessStrategyNumber - 1] = 0;
- }
-
- /* only one of >, >= */
- if (init[BTGreaterStrategyNumber - 1]
- && init[BTGreaterEqualStrategyNumber - 1])
- {
- ScanKeyData *gt, *ge;
-
- gt = &xform[BTGreaterStrategyNumber - 1];
- ge = &xform[BTGreaterEqualStrategyNumber - 1];
-
- /* see note above on function cache */
- test = (long) fmgr(ge->sk_procedure, gt->sk_argument, ge->sk_argument);
- if (test)
- init[BTGreaterEqualStrategyNumber - 1] = 0;
- else
- init[BTGreaterStrategyNumber - 1] = 0;
- }
-
- /* okay, reorder and count */
- for (j = BTMaxStrategyNumber; --j >= 0; )
- if (init[j])
- key[new_numberOfKeys++] = xform[j];
-
- if ( attno == 1 )
- so->numberOfFirstKeys = new_numberOfKeys;
-
- if ( i == numberOfKeys )
- break;
-
- /* initialization for new attno */
- attno = cur->sk_attno;
- memset(xform, 0, nbytes);
- map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
- BTMaxStrategyNumber,
- attno);
- /* haven't looked at any strategies yet */
- for (j = 0; j <= BTMaxStrategyNumber; j++)
- init[j] = 0;
+ if (cur->sk_flags & SK_ISNULL)
+ so->qual_ok = 0;
+ so->numberOfFirstKeys = 1;
+ return;
}
- for (j = BTMaxStrategyNumber; --j >= 0; )
- {
- if (cur->sk_procedure == map->entry[j].sk_procedure)
- break;
- }
-
- /* have we seen one of these before? */
- if (init[j])
- {
- /* yup, use the appropriate value */
- test =
- (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
- cur->sk_argument, xform[j].sk_argument);
- if (test)
- xform[j].sk_argument = cur->sk_argument;
- else if ( j == (BTEqualStrategyNumber - 1) )
- so->qual_ok = 0; /* key == a && key == b, but a != b */
- } else
+ /* get space for the modified array of keys */
+ nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
+ xform = (ScanKey) palloc(nbytes);
+
+ memset(xform, 0, nbytes);
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ attno);
+ for (j = 0; j <= BTMaxStrategyNumber; j++)
+ init[j] = 0;
+
+ /* check each key passed in */
+ for (i = 0;;)
{
- /* nope, use this value */
- memmove(&xform[j], cur, sizeof(*cur));
- init[j] = 1;
+ if (i < numberOfKeys)
+ cur = &key[i];
+
+ if (cur->sk_flags & SK_ISNULL) /* see comments above */
+ so->qual_ok = 0;
+
+ if (i == numberOfKeys || cur->sk_attno != attno)
+ {
+ if (cur->sk_attno != attno + 1 && i < numberOfKeys)
+ {
+ elog(WARN, "_bt_orderkeys: key(s) for attribute %d missed", attno + 1);
+ }
+
+ /*
+ * If = has been specified, no other key will be used. In case
+ * of key < 2 && key == 1 and so on we have to set qual_ok to
+ * 0
+ */
+ if (init[BTEqualStrategyNumber - 1])
+ {
+ ScanKeyData *eq,
+ *chk;
+
+ eq = &xform[BTEqualStrategyNumber - 1];
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ if (j == (BTEqualStrategyNumber - 1) || init[j] == 0)
+ continue;
+ chk = &xform[j];
+ test = (long) fmgr(chk->sk_procedure, eq->sk_argument, chk->sk_argument);
+ if (!test)
+ so->qual_ok = 0;
+ }
+ init[BTLessStrategyNumber - 1] = 0;
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* only one of <, <= */
+ if (init[BTLessStrategyNumber - 1]
+ && init[BTLessEqualStrategyNumber - 1])
+ {
+ ScanKeyData *lt,
+ *le;
+
+ lt = &xform[BTLessStrategyNumber - 1];
+ le = &xform[BTLessEqualStrategyNumber - 1];
+
+ /*
+ * DO NOT use the cached function stuff here -- this is
+ * key ordering, happens only when the user expresses a
+ * hokey qualification, and gets executed only once,
+ * anyway. The transform maps are hard-coded, and can't
+ * be initialized in the correct way.
+ */
+ test = (long) fmgr(le->sk_procedure, lt->sk_argument, le->sk_argument);
+ if (test)
+ init[BTLessEqualStrategyNumber - 1] = 0;
+ else
+ init[BTLessStrategyNumber - 1] = 0;
+ }
+
+ /* only one of >, >= */
+ if (init[BTGreaterStrategyNumber - 1]
+ && init[BTGreaterEqualStrategyNumber - 1])
+ {
+ ScanKeyData *gt,
+ *ge;
+
+ gt = &xform[BTGreaterStrategyNumber - 1];
+ ge = &xform[BTGreaterEqualStrategyNumber - 1];
+
+ /* see note above on function cache */
+ test = (long) fmgr(ge->sk_procedure, gt->sk_argument, ge->sk_argument);
+ if (test)
+ init[BTGreaterEqualStrategyNumber - 1] = 0;
+ else
+ init[BTGreaterStrategyNumber - 1] = 0;
+ }
+
+ /* okay, reorder and count */
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ if (init[j])
+ key[new_numberOfKeys++] = xform[j];
+
+ if (attno == 1)
+ so->numberOfFirstKeys = new_numberOfKeys;
+
+ if (i == numberOfKeys)
+ break;
+
+ /* initialization for new attno */
+ attno = cur->sk_attno;
+ memset(xform, 0, nbytes);
+ map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
+ BTMaxStrategyNumber,
+ attno);
+ /* haven't looked at any strategies yet */
+ for (j = 0; j <= BTMaxStrategyNumber; j++)
+ init[j] = 0;
+ }
+
+ for (j = BTMaxStrategyNumber; --j >= 0;)
+ {
+ if (cur->sk_procedure == map->entry[j].sk_procedure)
+ break;
+ }
+
+ /* have we seen one of these before? */
+ if (init[j])
+ {
+ /* yup, use the appropriate value */
+ test =
+ (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
+ cur->sk_argument, xform[j].sk_argument);
+ if (test)
+ xform[j].sk_argument = cur->sk_argument;
+ else if (j == (BTEqualStrategyNumber - 1))
+ so->qual_ok = 0;/* key == a && key == b, but a != b */
+ }
+ else
+ {
+ /* nope, use this value */
+ memmove(&xform[j], cur, sizeof(*cur));
+ init[j] = 1;
+ }
+
+ i++;
}
-
- i++;
- }
-
- so->numberOfKeys = new_numberOfKeys;
-
- pfree(xform);
+
+ so->numberOfKeys = new_numberOfKeys;
+
+ pfree(xform);
}
BTItem
_bt_formitem(IndexTuple itup)
{
- int nbytes_btitem;
- BTItem btitem;
- Size tuplen;
- extern Oid newoid();
-
- /* see comments in btbuild
-
- if (itup->t_info & INDEX_NULL_MASK)
- elog(WARN, "btree indices cannot include null keys");
- */
-
- /* make a copy of the index tuple with room for the sequence number */
- tuplen = IndexTupleSize(itup);
- nbytes_btitem = tuplen +
- (sizeof(BTItemData) - sizeof(IndexTupleData));
-
- btitem = (BTItem) palloc(nbytes_btitem);
- memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
-
+ int nbytes_btitem;
+ BTItem btitem;
+ Size tuplen;
+ extern Oid newoid();
+
+ /*
+ * see comments in btbuild
+ *
+ * if (itup->t_info & INDEX_NULL_MASK) elog(WARN, "btree indices cannot
+ * include null keys");
+ */
+
+ /* make a copy of the index tuple with room for the sequence number */
+ tuplen = IndexTupleSize(itup);
+ nbytes_btitem = tuplen +
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
+
+ btitem = (BTItem) palloc(nbytes_btitem);
+ memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
+
#ifndef BTREE_VERSION_1
- btitem->bti_oid = newoid();
+ btitem->bti_oid = newoid();
#endif
- return (btitem);
+ return (btitem);
}
#ifdef NOT_USED
bool
_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
{
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
- if (so->numberOfKeys > 0)
- return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
- so->numberOfKeys, so->keyData));
- else
- return (true);
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+ if (so->numberOfKeys > 0)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ so->numberOfKeys, so->keyData));
+ else
+ return (true);
}
+
#endif
#ifdef NOT_USED
bool
_bt_checkforkeys(IndexScanDesc scan, IndexTuple itup, Size keysz)
{
- BTScanOpaque so;
-
- so = (BTScanOpaque) scan->opaque;
- if ( keysz > 0 && so->numberOfKeys >= keysz )
- return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
- keysz, so->keyData));
- else
- return (true);
+ BTScanOpaque so;
+
+ so = (BTScanOpaque) scan->opaque;
+ if (keysz > 0 && so->numberOfKeys >= keysz)
+ return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
+ keysz, so->keyData));
+ else
+ return (true);
}
+
#endif
bool
-_bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, Size *keysok)
+_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, Size * keysok)
{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- Size keysz = so->numberOfKeys;
- TupleDesc tupdesc;
- ScanKey key;
- Datum datum;
- bool isNull;
- int test;
-
- *keysok = 0;
- if ( keysz == 0 )
- return (true);
-
- key = so->keyData;
- tupdesc = RelationGetTupleDescriptor(scan->relation);
-
- IncrIndexProcessed();
-
- while (keysz > 0)
- {
- datum = index_getattr(tuple,
- key[0].sk_attno,
- tupdesc,
- &isNull);
-
- /* btree doesn't support 'A is null' clauses, yet */
- if ( isNull || key[0].sk_flags & SK_ISNULL )
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ Size keysz = so->numberOfKeys;
+ TupleDesc tupdesc;
+ ScanKey key;
+ Datum datum;
+ bool isNull;
+ int test;
+
+ *keysok = 0;
+ if (keysz == 0)
+ return (true);
+
+ key = so->keyData;
+ tupdesc = RelationGetTupleDescriptor(scan->relation);
+
+ IncrIndexProcessed();
+
+ while (keysz > 0)
{
- return (false);
- }
+ datum = index_getattr(tuple,
+ key[0].sk_attno,
+ tupdesc,
+ &isNull);
- if (key[0].sk_flags & SK_COMMUTE) {
- test = (int) (*(key[0].sk_func))
- (DatumGetPointer(key[0].sk_argument),
- datum);
- } else {
- test = (int) (*(key[0].sk_func))
- (datum,
- DatumGetPointer(key[0].sk_argument));
- }
-
- if (!test == !(key[0].sk_flags & SK_NEGATE)) {
- return (false);
+ /* btree doesn't support 'A is null' clauses, yet */
+ if (isNull || key[0].sk_flags & SK_ISNULL)
+ {
+ return (false);
+ }
+
+ if (key[0].sk_flags & SK_COMMUTE)
+ {
+ test = (int) (*(key[0].sk_func))
+ (DatumGetPointer(key[0].sk_argument),
+ datum);
+ }
+ else
+ {
+ test = (int) (*(key[0].sk_func))
+ (datum,
+ DatumGetPointer(key[0].sk_argument));
+ }
+
+ if (!test == !(key[0].sk_flags & SK_NEGATE))
+ {
+ return (false);
+ }
+
+ keysz -= 1;
+ key++;
+ (*keysok)++;
}
-
- keysz -= 1;
- key++;
- (*keysok)++;
- }
-
- return (true);
+
+ return (true);
}