diff options
Diffstat (limited to 'src/backend/access/nbtree')
-rw-r--r-- | src/backend/access/nbtree/nbtcompare.c | 201 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 2923 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 902 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 927 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtscan.c | 267 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 2617 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 1926 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtstrat.c | 156 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c | 623 |
9 files changed, 5423 insertions, 5119 deletions
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c index f005509be07..0312bbb69d7 100644 --- a/src/backend/access/nbtree/nbtcompare.c +++ b/src/backend/access/nbtree/nbtcompare.c @@ -1,22 +1,22 @@ /*------------------------------------------------------------------------- * * nbtcompare.c-- - * Comparison functions for btree access method. + * Comparison functions for btree access method. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.10 1997/06/11 05:20:05 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.11 1997/09/07 04:38:39 momjian Exp $ * - * NOTES - * These functions are stored in pg_amproc. For each operator class - * defined on btrees, they compute + * NOTES + * These functions are stored in pg_amproc. For each operator class + * defined on btrees, they compute * - * compare(a, b): - * < 0 if a < b, - * = 0 if a == b, - * > 0 if a > b. + * compare(a, b): + * < 0 if a < b, + * = 0 if a == b, + * > 0 if a > b. *------------------------------------------------------------------------- */ @@ -30,168 +30,171 @@ int32 btint2cmp(int16 a, int16 b) { - return ((int32) (a - b)); + return ((int32) (a - b)); } int32 btint4cmp(int32 a, int32 b) { - return (a - b); + return (a - b); } int32 btint24cmp(int16 a, int32 b) { - return (((int32) a) - b); + return (((int32) a) - b); } int32 btint42cmp(int32 a, int16 b) { - return (a - ((int32) b)); + return (a - ((int32) b)); } int32 btfloat4cmp(float32 a, float32 b) { - if (*a > *b) - return (1); - else if (*a == *b) - return (0); - else - return (-1); + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); } int32 btfloat8cmp(float64 a, float64 b) { - if (*a > *b) - return (1); - else if (*a == *b) - return (0); - else - return (-1); + if (*a > *b) + return (1); + else if (*a == *b) + return (0); + else + return (-1); } int32 btoidcmp(Oid a, Oid b) { - if (a > b) - return (1); - else if (a == b) - return (0); - else - return (-1); + if (a > b) + return (1); + else if (a == b) + return (0); + else + return (-1); } int32 btabstimecmp(AbsoluteTime a, AbsoluteTime b) { - if (AbsoluteTimeIsBefore(a, b)) - return (-1); - else if (AbsoluteTimeIsBefore(b, a)) - return (1); - else - return (0); + if (AbsoluteTimeIsBefore(a, b)) + return (-1); + else if (AbsoluteTimeIsBefore(b, a)) + return (1); + else + return (0); } int32 btcharcmp(char a, char b) { - return ((int32) ((uint8)a - (uint8)b)); + return ((int32) ((uint8) a - (uint8) b)); } int32 btchar2cmp(uint16 a, uint16 b) { - return (strncmp((char *) &a, (char *) &b, 2)); + return (strncmp((char *) &a, (char *) &b, 2)); } int32 btchar4cmp(uint32 a, uint32 b) { - return (strncmp((char *) &a, (char *) &b, 4)); + return (strncmp((char *) &a, (char *) &b, 4)); } int32 btchar8cmp(char *a, char *b) { - return (strncmp(a, b, 8)); + return (strncmp(a, b, 8)); } int32 btchar16cmp(char *a, char *b) { - return (strncmp(a, b, 16)); + return (strncmp(a, b, 16)); } int32 -btnamecmp(NameData *a, NameData *b) +btnamecmp(NameData * a, NameData * b) { - return (strncmp(a->data, b->data, NAMEDATALEN)); + return (strncmp(a->data, b->data, NAMEDATALEN)); } int32 -bttextcmp(struct varlena *a, struct varlena *b) +bttextcmp(struct varlena * a, struct varlena * b) { - int res; - unsigned char *ap, *bp; + int res; + unsigned char *ap, + *bp; #ifdef USE_LOCALE - int la = VARSIZE(a) - VARHDRSZ; - int lb = VARSIZE(b) - VARHDRSZ; - - ap = (unsigned char *) palloc (la + 1); - bp = (unsigned char *) palloc (lb + 1); - - memcpy(ap, VARDATA(a), la); - *(ap + la) = '\0'; - memcpy(bp, VARDATA(b), lb); - *(bp + lb) = '\0'; - - res = strcoll (ap, bp); - - pfree (ap); - pfree (bp); + int la = VARSIZE(a) - VARHDRSZ; + int lb = VARSIZE(b) - VARHDRSZ; + + ap = (unsigned char *) palloc(la + 1); + bp = (unsigned char *) palloc(lb + 1); + + memcpy(ap, VARDATA(a), la); + *(ap + la) = '\0'; + memcpy(bp, VARDATA(b), lb); + *(bp + lb) = '\0'; + + res = strcoll(ap, bp); + + pfree(ap); + pfree(bp); #else - int len = VARSIZE(a); - - /* len is the length of the shorter of the two strings */ - if ( len > VARSIZE(b) ) - len = VARSIZE(b); - - len -= VARHDRSZ; - - ap = (unsigned char *) VARDATA(a); - bp = (unsigned char *) VARDATA(b); - - /* - * If the two strings differ in the first len bytes, or if they're - * the same in the first len bytes and they're both len bytes long, - * we're done. - */ - - res = 0; - if (len > 0) { - do { - res = (int) (*ap++ - *bp++); - len--; - } while (res == 0 && len != 0); - } + int len = VARSIZE(a); + + /* len is the length of the shorter of the two strings */ + if (len > VARSIZE(b)) + len = VARSIZE(b); + + len -= VARHDRSZ; + + ap = (unsigned char *) VARDATA(a); + bp = (unsigned char *) VARDATA(b); + + /* + * If the two strings differ in the first len bytes, or if they're the + * same in the first len bytes and they're both len bytes long, we're + * done. + */ + + res = 0; + if (len > 0) + { + do + { + res = (int) (*ap++ - *bp++); + len--; + } while (res == 0 && len != 0); + } #endif - - if (res != 0 || VARSIZE(a) == VARSIZE(b)) - return (res); - - /* - * The two strings are the same in the first len bytes, and they - * are of different lengths. - */ - - if (VARSIZE(a) < VARSIZE(b)) - return (-1); - else - return (1); + + if (res != 0 || VARSIZE(a) == VARSIZE(b)) + return (res); + + /* + * The two strings are the same in the first len bytes, and they are + * of different lengths. + */ + + if (VARSIZE(a) < VARSIZE(b)) + return (-1); + else + return (1); } diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 4dfa6fd2558..4bafbc2ddbb 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * btinsert.c-- - * Item insertion in Lehman and Yao btrees for Postgres. + * Item insertion in Lehman and Yao btrees for Postgres. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.17 1997/08/20 14:53:15 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.18 1997/09/07 04:38:45 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -22,1386 +22,1437 @@ #include <fmgr.h> #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem); -static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright); +static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright); static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit); -static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); +static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf); static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem); -static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem); -static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, BTItem oldItem, BTItem newItem); -static bool _bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); +static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem); +static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, BTItem oldItem, BTItem newItem); +static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); /* - * _bt_doinsert() -- Handle insertion of a single btitem in the tree. + * _bt_doinsert() -- Handle insertion of a single btitem in the tree. * - * This routine is called by the public interface routines, btbuild - * and btinsert. By here, btitem is filled in, and has a unique - * (xid, seqno) pair. + * This routine is called by the public interface routines, btbuild + * and btinsert. By here, btitem is filled in, and has a unique + * (xid, seqno) pair. */ InsertIndexResult _bt_doinsert(Relation rel, BTItem btitem, bool index_is_unique, Relation heapRel) { - ScanKey itup_scankey; - IndexTuple itup; - BTStack stack; - Buffer buf; - BlockNumber blkno; - int natts = rel->rd_rel->relnatts; - InsertIndexResult res; - - itup = &(btitem->bti_itup); - - /* we need a scan key to do our search, so build one */ - itup_scankey = _bt_mkscankey(rel, itup); - - /* find the page containing this key */ - stack = _bt_search(rel, natts, itup_scankey, &buf); - - blkno = BufferGetBlockNumber(buf); - - /* trade in our read lock for a write lock */ - _bt_relbuf(rel, buf, BT_READ); - buf = _bt_getbuf(rel, blkno, BT_WRITE); - - /* - * If the page was split between the time that we surrendered our - * read lock and acquired our write lock, then this page may no - * longer be the right place for the key we want to insert. In this - * case, we need to move right in the tree. See Lehman and Yao for - * an excruciatingly precise description. - */ - - buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); - - /* if we're not allowing duplicates, make sure the key isn't */ - /* already in the node */ - if ( index_is_unique ) - { - OffsetNumber offset, maxoff; - Page page; + ScanKey itup_scankey; + IndexTuple itup; + BTStack stack; + Buffer buf; + BlockNumber blkno; + int natts = rel->rd_rel->relnatts; + InsertIndexResult res; - page = BufferGetPage(buf); - maxoff = PageGetMaxOffsetNumber (page); + itup = &(btitem->bti_itup); + + /* we need a scan key to do our search, so build one */ + itup_scankey = _bt_mkscankey(rel, itup); + + /* find the page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, &buf); - offset = _bt_binsrch(rel, buf, natts, itup_scankey, BT_DESCENT); + blkno = BufferGetBlockNumber(buf); - /* make sure the offset we're given points to an actual */ - /* key on the page before trying to compare it */ - if ( !PageIsEmpty (page) && offset <= maxoff ) + /* trade in our read lock for a write lock */ + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_WRITE); + + /* + * If the page was split between the time that we surrendered our read + * lock and acquired our write lock, then this page may no longer be + * the right place for the key we want to insert. In this case, we + * need to move right in the tree. See Lehman and Yao for an + * excruciatingly precise description. + */ + + buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); + + /* if we're not allowing duplicates, make sure the key isn't */ + /* already in the node */ + if (index_is_unique) { - TupleDesc itupdesc; - BTItem btitem; - IndexTuple itup; - HeapTuple htup; - BTPageOpaque opaque; - Buffer nbuf; - BlockNumber blkno; - - itupdesc = RelationGetTupleDescriptor(rel); - nbuf = InvalidBuffer; - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* - * _bt_compare returns 0 for (1,NULL) and (1,NULL) - - * this's how we handling NULLs - and so we must not use - * _bt_compare in real comparison, but only for - * ordering/finding items on pages. - vadim 03/24/97 - - while ( !_bt_compare (rel, itupdesc, page, - natts, itup_scankey, offset) ) - */ - while ( _bt_isequal (itupdesc, page, offset, natts, itup_scankey) ) - { /* they're equal */ - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offset)); - itup = &(btitem->bti_itup); - htup = heap_fetch (heapRel, SelfTimeQual, &(itup->t_tid), NULL); - if ( htup != (HeapTuple) NULL ) - { /* it is a duplicate */ - elog(WARN, "Cannot insert a duplicate key into a unique index."); - } - /* get next offnum */ - if ( offset < maxoff ) - { - offset = OffsetNumberNext(offset); - } - else - { /* move right ? */ - if ( P_RIGHTMOST (opaque) ) - break; - if ( !_bt_isequal (itupdesc, page, P_HIKEY, - natts, itup_scankey) ) - break; - /* - * min key of the right page is the same, - * ooh - so many dead duplicates... - */ - blkno = opaque->btpo_next; - if ( nbuf != InvalidBuffer ) - _bt_relbuf (rel, nbuf, BT_READ); - for (nbuf = InvalidBuffer; ; ) - { - nbuf = _bt_getbuf (rel, blkno, BT_READ); - page = BufferGetPage (nbuf); - maxoff = PageGetMaxOffsetNumber(page); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - offset = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - if ( ! PageIsEmpty (page) && offset <= maxoff ) - { /* Found some key */ - break; - } - else - { /* Empty or "pseudo"-empty page - get next */ - blkno = opaque->btpo_next; - _bt_relbuf (rel, nbuf, BT_READ); - nbuf = InvalidBuffer; - if ( blkno == P_NONE ) - break; + OffsetNumber offset, + maxoff; + Page page; + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + offset = _bt_binsrch(rel, buf, natts, itup_scankey, BT_DESCENT); + + /* make sure the offset we're given points to an actual */ + /* key on the page before trying to compare it */ + if (!PageIsEmpty(page) && offset <= maxoff) + { + TupleDesc itupdesc; + BTItem btitem; + IndexTuple itup; + HeapTuple htup; + BTPageOpaque opaque; + Buffer nbuf; + BlockNumber blkno; + + itupdesc = RelationGetTupleDescriptor(rel); + nbuf = InvalidBuffer; + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* + * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's + * how we handling NULLs - and so we must not use _bt_compare + * in real comparison, but only for ordering/finding items on + * pages. - vadim 03/24/97 + * + * while ( !_bt_compare (rel, itupdesc, page, natts, + * itup_scankey, offset) ) + */ + while (_bt_isequal(itupdesc, page, offset, natts, itup_scankey)) + { /* they're equal */ + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offset)); + itup = &(btitem->bti_itup); + htup = heap_fetch(heapRel, SelfTimeQual, &(itup->t_tid), NULL); + if (htup != (HeapTuple) NULL) + { /* it is a duplicate */ + elog(WARN, "Cannot insert a duplicate key into a unique index."); + } + /* get next offnum */ + if (offset < maxoff) + { + offset = OffsetNumberNext(offset); + } + else + { /* move right ? */ + if (P_RIGHTMOST(opaque)) + break; + if (!_bt_isequal(itupdesc, page, P_HIKEY, + natts, itup_scankey)) + break; + + /* + * min key of the right page is the same, ooh - so + * many dead duplicates... + */ + blkno = opaque->btpo_next; + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf, BT_READ); + for (nbuf = InvalidBuffer;;) + { + nbuf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(nbuf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offset = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + if (!PageIsEmpty(page) && offset <= maxoff) + { /* Found some key */ + break; + } + else + { /* Empty or "pseudo"-empty page - get next */ + blkno = opaque->btpo_next; + _bt_relbuf(rel, nbuf, BT_READ); + nbuf = InvalidBuffer; + if (blkno == P_NONE) + break; + } + } + if (nbuf == InvalidBuffer) + break; + } } - } - if ( nbuf == InvalidBuffer ) - break; - } - } - if ( nbuf != InvalidBuffer ) - _bt_relbuf(rel, nbuf, BT_READ); + if (nbuf != InvalidBuffer) + _bt_relbuf(rel, nbuf, BT_READ); + } } - } - - /* do the insertion */ - res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, - btitem, (BTItem) NULL); - - /* be tidy */ - _bt_freestack(stack); - _bt_freeskey(itup_scankey); - - return (res); + + /* do the insertion */ + res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey, + btitem, (BTItem) NULL); + + /* be tidy */ + _bt_freestack(stack); + _bt_freeskey(itup_scankey); + + return (res); } /* - * _bt_insertonpg() -- Insert a tuple on a particular page in the index. + * _bt_insertonpg() -- Insert a tuple on a particular page in the index. * - * This recursive procedure does the following things: + * This recursive procedure does the following things: * - * + if necessary, splits the target page. - * + finds the right place to insert the tuple (taking into - * account any changes induced by a split). - * + inserts the tuple. - * + if the page was split, pops the parent stack, and finds the - * right place to insert the new child pointer (by walking - * right using information stored in the parent stack). - * + invoking itself with the appropriate tuple for the right - * child page on the parent. + * + if necessary, splits the target page. + * + finds the right place to insert the tuple (taking into + * account any changes induced by a split). + * + inserts the tuple. + * + if the page was split, pops the parent stack, and finds the + * right place to insert the new child pointer (by walking + * right using information stored in the parent stack). + * + invoking itself with the appropriate tuple for the right + * child page on the parent. * - * On entry, we must have the right buffer on which to do the - * insertion, and the buffer must be pinned and locked. On return, - * we will have dropped both the pin and the write lock on the buffer. + * On entry, we must have the right buffer on which to do the + * insertion, and the buffer must be pinned and locked. On return, + * we will have dropped both the pin and the write lock on the buffer. * - * The locking interactions in this code are critical. You should - * grok Lehman and Yao's paper before making any changes. In addition, - * you need to understand how we disambiguate duplicate keys in this - * implementation, in order to be able to find our location using - * L&Y "move right" operations. Since we may insert duplicate user - * keys, and since these dups may propogate up the tree, we use the - * 'afteritem' parameter to position ourselves correctly for the - * insertion on internal pages. + * The locking interactions in this code are critical. You should + * grok Lehman and Yao's paper before making any changes. In addition, + * you need to understand how we disambiguate duplicate keys in this + * implementation, in order to be able to find our location using + * L&Y "move right" operations. Since we may insert duplicate user + * keys, and since these dups may propogate up the tree, we use the + * 'afteritem' parameter to position ourselves correctly for the + * insertion on internal pages. */ -static InsertIndexResult +static InsertIndexResult _bt_insertonpg(Relation rel, - Buffer buf, - BTStack stack, - int keysz, - ScanKey scankey, - BTItem btitem, - BTItem afteritem) + Buffer buf, + BTStack stack, + int keysz, + ScanKey scankey, + BTItem btitem, + BTItem afteritem) { - InsertIndexResult res; - Page page; - BTPageOpaque lpageop; - BlockNumber itup_blkno; - OffsetNumber itup_off; - OffsetNumber firstright = InvalidOffsetNumber; - int itemsz; - bool do_split = false; - bool keys_equal = false; - - page = BufferGetPage(buf); - lpageop = (BTPageOpaque) PageGetSpecialPointer(page); - - itemsz = IndexTupleDSize(btitem->bti_itup) - + (sizeof(BTItemData) - sizeof(IndexTupleData)); - - itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this - but we need to be consistent */ - /* - * If we have to insert item on the leftmost page which is the first - * page in the chain of duplicates then: - * 1. if scankey == hikey (i.e. - new duplicate item) then - * insert it here; - * 2. if scankey < hikey then: - * 2.a if there is duplicate key(s) here - we force splitting; - * 2.b else - we may "eat" this page from duplicates chain. - */ - if ( lpageop->btpo_flags & BTP_CHAIN ) - { - OffsetNumber maxoff = PageGetMaxOffsetNumber (page); - ItemId hitemid; - BTItem hitem; - - Assert ( !P_RIGHTMOST(lpageop) ); - hitemid = PageGetItemId(page, P_HIKEY); - hitem = (BTItem) PageGetItem(page, hitemid); - if ( maxoff > P_HIKEY && - !_bt_itemcmp (rel, keysz, hitem, - (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)), - BTEqualStrategyNumber) ) - elog (FATAL, "btree: bad key on the page in the chain of duplicates"); - - if ( !_bt_skeycmp (rel, keysz, scankey, page, hitemid, - BTEqualStrategyNumber) ) - { - if ( !P_LEFTMOST(lpageop) ) - elog (FATAL, "btree: attempt to insert bad key on the non-leftmost page in the chain of duplicates"); - if ( !_bt_skeycmp (rel, keysz, scankey, page, hitemid, - BTLessStrategyNumber) ) - elog (FATAL, "btree: attempt to insert higher key on the leftmost page in the chain of duplicates"); - if ( maxoff > P_HIKEY ) /* have duplicate(s) */ - { - firstright = P_FIRSTKEY; - do_split = true; - } - else /* "eat" page */ - { - Buffer pbuf; - Page ppage; - - itup_blkno = BufferGetBlockNumber(buf); - itup_off = PageAddItem(page, (Item) btitem, itemsz, - P_FIRSTKEY, LP_USED); - if ( itup_off == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item"); - lpageop->btpo_flags &= ~BTP_CHAIN; - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - ppage = BufferGetPage(pbuf); - PageIndexTupleDelete(ppage, stack->bts_offset); - pfree(stack->bts_btitem); - stack->bts_btitem = _bt_formitem(&(btitem->bti_itup)); - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), - itup_blkno, P_HIKEY); - _bt_wrtbuf(rel, buf); - res = _bt_insertonpg(rel, pbuf, stack->bts_parent, - keysz, scankey, stack->bts_btitem, - NULL); - ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); - return (res); - } - } - else - { - keys_equal = true; - if ( PageGetFreeSpace(page) < itemsz ) - do_split = true; - } - } - else if ( PageGetFreeSpace(page) < itemsz ) - do_split = true; - else if ( PageGetFreeSpace(page) < 3*itemsz + 2*sizeof(ItemIdData) ) - { - OffsetNumber offnum = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY; - OffsetNumber maxoff = PageGetMaxOffsetNumber (page); - ItemId itid; - BTItem previtem, chkitem; - Size maxsize; - Size currsize; - - itid = PageGetItemId(page, offnum); - previtem = (BTItem) PageGetItem(page, itid); - maxsize = currsize = (ItemIdGetLength(itid) + sizeof(ItemIdData)); - for (offnum = OffsetNumberNext(offnum); - offnum <= maxoff; offnum = OffsetNumberNext(offnum) ) - { - itid = PageGetItemId(page, offnum); - chkitem = (BTItem) PageGetItem(page, itid); - if ( !_bt_itemcmp (rel, keysz, previtem, chkitem, - BTEqualStrategyNumber) ) - { - if ( currsize > maxsize ) - maxsize = currsize; - currsize = 0; - previtem = chkitem; - } - currsize += (ItemIdGetLength(itid) + sizeof(ItemIdData)); - } - if ( currsize > maxsize ) - maxsize = currsize; - maxsize += sizeof (PageHeaderData) + - DOUBLEALIGN (sizeof (BTPageOpaqueData)); - if ( maxsize >= PageGetPageSize (page) / 2 ) - do_split = true; - } - - if ( do_split ) - { - Buffer rbuf; - Page rpage; - BTItem ritem; - BlockNumber rbknum; - BTPageOpaque rpageop; - Buffer pbuf; - Page ppage; - BTPageOpaque ppageop; - BlockNumber bknum = BufferGetBlockNumber(buf); - BTItem lowLeftItem; - OffsetNumber maxoff; - bool shifted = false; - bool left_chained = ( lpageop->btpo_flags & BTP_CHAIN ) ? true : false; - - /* - * If we have to split leaf page in the chain of duplicates by - * new duplicate then we try to look at our right sibling first. - */ - if ( ( lpageop->btpo_flags & BTP_CHAIN ) && - ( lpageop->btpo_flags & BTP_LEAF ) && keys_equal ) - { - bool use_left = true; - - rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); - rpage = BufferGetPage(rbuf); - rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); - if ( !P_RIGHTMOST (rpageop) ) /* non-rightmost page */ - { /* - * If we have the same hikey here then it's - * yet another page in chain. - */ - if ( _bt_skeycmp (rel, keysz, scankey, rpage, - PageGetItemId(rpage, P_HIKEY), - BTEqualStrategyNumber) ) - { - if ( !( rpageop->btpo_flags & BTP_CHAIN ) ) - elog (FATAL, "btree: lost page in the chain of duplicates"); - } - else if ( _bt_skeycmp (rel, keysz, scankey, rpage, - PageGetItemId(rpage, P_HIKEY), - BTGreaterStrategyNumber) ) - elog (FATAL, "btree: hikey is out of order"); - else if ( rpageop->btpo_flags & BTP_CHAIN ) - /* - * If hikey > scankey then it's last page in chain and - * BTP_CHAIN must be OFF - */ - elog (FATAL, "btree: lost last page in the chain of duplicates"); - - /* if there is room here then we use this page. */ - if ( PageGetFreeSpace (rpage) > itemsz ) - use_left = false; - } - else /* rightmost page */ - { - Assert ( !( rpageop->btpo_flags & BTP_CHAIN ) ); - /* if there is room here then we use this page. */ - if ( PageGetFreeSpace (rpage) > itemsz ) - use_left = false; - } - if ( !use_left ) /* insert on the right page */ - { - _bt_relbuf(rel, buf, BT_WRITE); - return ( _bt_insertonpg(rel, rbuf, stack, keysz, - scankey, btitem, afteritem) ); - } - _bt_relbuf(rel, rbuf, BT_WRITE); - } + InsertIndexResult res; + Page page; + BTPageOpaque lpageop; + BlockNumber itup_blkno; + OffsetNumber itup_off; + OffsetNumber firstright = InvalidOffsetNumber; + int itemsz; + bool do_split = false; + bool keys_equal = false; + + page = BufferGetPage(buf); + lpageop = (BTPageOpaque) PageGetSpecialPointer(page); + + itemsz = IndexTupleDSize(btitem->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + + itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do + * this but we need to be + * consistent */ + /* - * If after splitting un-chained page we'll got chain of pages - * with duplicates then we want to know - * 1. on which of two pages new btitem will go (current - * _bt_findsplitloc is quite bad); - * 2. what parent (if there's one) thinking about it - * (remember about deletions) + * If we have to insert item on the leftmost page which is the first + * page in the chain of duplicates then: 1. if scankey == hikey (i.e. + * - new duplicate item) then insert it here; 2. if scankey < hikey + * then: 2.a if there is duplicate key(s) here - we force splitting; + * 2.b else - we may "eat" this page from duplicates chain. */ - else if ( !( lpageop->btpo_flags & BTP_CHAIN ) ) + if (lpageop->btpo_flags & BTP_CHAIN) { - OffsetNumber start = ( P_RIGHTMOST(lpageop) ) ? P_HIKEY : P_FIRSTKEY; - Size llimit; - - maxoff = PageGetMaxOffsetNumber (page); - llimit = PageGetPageSize(page) - sizeof (PageHeaderData) - - DOUBLEALIGN (sizeof (BTPageOpaqueData)) - + sizeof(ItemIdData); - llimit /= 2; - firstright = _bt_findsplitloc(rel, page, start, maxoff, llimit); - - if ( _bt_itemcmp (rel, keysz, - (BTItem) PageGetItem(page, PageGetItemId(page, start)), - (BTItem) PageGetItem(page, PageGetItemId(page, firstright)), - BTEqualStrategyNumber) ) - { - if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId(page, firstright), - BTLessStrategyNumber) ) - /* - * force moving current items to the new page: - * new item will go on the current page. - */ - firstright = start; - else - /* - * new btitem >= firstright, start item == firstright - - * new chain of duplicates: if this non-leftmost leaf - * page and parent item < start item then force moving - * all items to the new page - current page will be - * "empty" after it. - */ - { - if ( !P_LEFTMOST (lpageop) && - ( lpageop->btpo_flags & BTP_LEAF ) ) - { - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), - bknum, P_HIKEY); - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - if ( _bt_itemcmp (rel, keysz, stack->bts_btitem, - (BTItem) PageGetItem(page, - PageGetItemId(page, start)), - BTLessStrategyNumber) ) - { - firstright = start; - shifted = true; - } - _bt_relbuf(rel, pbuf, BT_WRITE); - } - } - } /* else - no new chain if start item < firstright one */ - } - - /* split the buffer into left and right halves */ - rbuf = _bt_split(rel, buf, firstright); - - /* which new page (left half or right half) gets the tuple? */ - if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) { - /* left page */ - itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, - itemsz, btitem, afteritem); - itup_blkno = BufferGetBlockNumber(buf); - } else { - /* right page */ - itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey, - itemsz, btitem, afteritem); - itup_blkno = BufferGetBlockNumber(rbuf); + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + ItemId hitemid; + BTItem hitem; + + Assert(!P_RIGHTMOST(lpageop)); + hitemid = PageGetItemId(page, P_HIKEY); + hitem = (BTItem) PageGetItem(page, hitemid); + if (maxoff > P_HIKEY && + !_bt_itemcmp(rel, keysz, hitem, + (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)), + BTEqualStrategyNumber)) + elog(FATAL, "btree: bad key on the page in the chain of duplicates"); + + if (!_bt_skeycmp(rel, keysz, scankey, page, hitemid, + BTEqualStrategyNumber)) + { + if (!P_LEFTMOST(lpageop)) + elog(FATAL, "btree: attempt to insert bad key on the non-leftmost page in the chain of duplicates"); + if (!_bt_skeycmp(rel, keysz, scankey, page, hitemid, + BTLessStrategyNumber)) + elog(FATAL, "btree: attempt to insert higher key on the leftmost page in the chain of duplicates"); + if (maxoff > P_HIKEY) /* have duplicate(s) */ + { + firstright = P_FIRSTKEY; + do_split = true; + } + else +/* "eat" page */ + { + Buffer pbuf; + Page ppage; + + itup_blkno = BufferGetBlockNumber(buf); + itup_off = PageAddItem(page, (Item) btitem, itemsz, + P_FIRSTKEY, LP_USED); + if (itup_off == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item"); + lpageop->btpo_flags &= ~BTP_CHAIN; + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + ppage = BufferGetPage(pbuf); + PageIndexTupleDelete(ppage, stack->bts_offset); + pfree(stack->bts_btitem); + stack->bts_btitem = _bt_formitem(&(btitem->bti_itup)); + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), + itup_blkno, P_HIKEY); + _bt_wrtbuf(rel, buf); + res = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, scankey, stack->bts_btitem, + NULL); + ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); + return (res); + } + } + else + { + keys_equal = true; + if (PageGetFreeSpace(page) < itemsz) + do_split = true; + } } - - maxoff = PageGetMaxOffsetNumber (page); - if ( shifted ) - { - if ( maxoff > P_FIRSTKEY ) - elog (FATAL, "btree: shifted page is not empty"); - lowLeftItem = (BTItem) NULL; - } - else - { - if ( maxoff < P_FIRSTKEY ) - elog (FATAL, "btree: un-shifted page is empty"); - lowLeftItem = (BTItem) PageGetItem(page, - PageGetItemId(page, P_FIRSTKEY)); - if ( _bt_itemcmp (rel, keysz, lowLeftItem, - (BTItem) PageGetItem(page, PageGetItemId(page, P_HIKEY)), - BTEqualStrategyNumber) ) - lpageop->btpo_flags |= BTP_CHAIN; + else if (PageGetFreeSpace(page) < itemsz) + do_split = true; + else if (PageGetFreeSpace(page) < 3 * itemsz + 2 * sizeof(ItemIdData)) + { + OffsetNumber offnum = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY; + OffsetNumber maxoff = PageGetMaxOffsetNumber(page); + ItemId itid; + BTItem previtem, + chkitem; + Size maxsize; + Size currsize; + + itid = PageGetItemId(page, offnum); + previtem = (BTItem) PageGetItem(page, itid); + maxsize = currsize = (ItemIdGetLength(itid) + sizeof(ItemIdData)); + for (offnum = OffsetNumberNext(offnum); + offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + itid = PageGetItemId(page, offnum); + chkitem = (BTItem) PageGetItem(page, itid); + if (!_bt_itemcmp(rel, keysz, previtem, chkitem, + BTEqualStrategyNumber)) + { + if (currsize > maxsize) + maxsize = currsize; + currsize = 0; + previtem = chkitem; + } + currsize += (ItemIdGetLength(itid) + sizeof(ItemIdData)); + } + if (currsize > maxsize) + maxsize = currsize; + maxsize += sizeof(PageHeaderData) + + DOUBLEALIGN(sizeof(BTPageOpaqueData)); + if (maxsize >= PageGetPageSize(page) / 2) + do_split = true; } - /* - * By here, - * - * + our target page has been split; - * + the original tuple has been inserted; - * + we have write locks on both the old (left half) and new - * (right half) buffers, after the split; and - * + we have the key we want to insert into the parent. - * - * Do the parent insertion. We need to hold onto the locks for - * the child pages until we locate the parent, but we can release - * them before doing the actual insertion (see Lehman and Yao for - * the reasoning). - */ - - if (stack == (BTStack) NULL) { - - /* create a new root node and release the split buffers */ - _bt_newroot(rel, buf, rbuf); - _bt_relbuf(rel, buf, BT_WRITE); - _bt_relbuf(rel, rbuf, BT_WRITE); - - } else { - ScanKey newskey; - InsertIndexResult newres; - BTItem new_item; - OffsetNumber upditem_offset = P_HIKEY; - bool do_update = false; - bool update_in_place = true; - bool parent_chained; - - /* form a index tuple that points at the new right page */ - rbknum = BufferGetBlockNumber(rbuf); - rpage = BufferGetPage(rbuf); - rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); - - /* - * By convention, the first entry (1) on every - * non-rightmost page is the high key for that page. In - * order to get the lowest key on the new right page, we - * actually look at its second (2) entry. - */ - - if (! P_RIGHTMOST(rpageop)) - { - ritem = (BTItem) PageGetItem(rpage, - PageGetItemId(rpage, P_FIRSTKEY)); - if ( _bt_itemcmp (rel, keysz, ritem, - (BTItem) PageGetItem(rpage, - PageGetItemId(rpage, P_HIKEY)), - BTEqualStrategyNumber) ) - rpageop->btpo_flags |= BTP_CHAIN; - } - else - ritem = (BTItem) PageGetItem(rpage, - PageGetItemId(rpage, P_HIKEY)); - - /* get a unique btitem for this key */ - new_item = _bt_formitem(&(ritem->bti_itup)); - - ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); - - /* - * Find the parent buffer and get the parent page. - * - * Oops - if we were moved right then we need to - * change stack item! We want to find parent pointing to - * where we are, right ? - vadim 05/27/97 - */ - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), - bknum, P_HIKEY); - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - ppage = BufferGetPage(pbuf); - ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); - parent_chained = (( ppageop->btpo_flags & BTP_CHAIN )) ? true : false; - - if ( parent_chained && !left_chained ) - elog (FATAL, "nbtree: unexpected chained parent of unchained page"); - - /* - * If the key of new_item is < than the key of the item - * in the parent page pointing to the left page - * (stack->bts_btitem), we have to update the latter key; - * otherwise the keys on the parent page wouldn't be - * monotonically increasing after we inserted the new - * pointer to the right page (new_item). This only - * happens if our left page is the leftmost page and a - * new minimum key had been inserted before, which is not - * reflected in the parent page but didn't matter so - * far. If there are duplicate keys and this new minimum - * key spills over to our new right page, we get an - * inconsistency if we don't update the left key in the - * parent page. - * - * Also, new duplicates handling code require us to update - * parent item if some smaller items left on the left page - * (which is possible in splitting leftmost page) and - * current parent item == new_item. - vadim 05/27/97 - */ - if ( _bt_itemcmp (rel, keysz, stack->bts_btitem, new_item, - BTGreaterStrategyNumber) || - ( !shifted && - _bt_itemcmp(rel, keysz, stack->bts_btitem, - new_item, BTEqualStrategyNumber) && - _bt_itemcmp(rel, keysz, lowLeftItem, - new_item, BTLessStrategyNumber) ) ) - { - do_update = true; - /* - * figure out which key is leftmost (if the parent page - * is rightmost, too, it must be the root) + if (do_split) + { + Buffer rbuf; + Page rpage; + BTItem ritem; + BlockNumber rbknum; + BTPageOpaque rpageop; + Buffer pbuf; + Page ppage; + BTPageOpaque ppageop; + BlockNumber bknum = BufferGetBlockNumber(buf); + BTItem lowLeftItem; + OffsetNumber maxoff; + bool shifted = false; + bool left_chained = (lpageop->btpo_flags & BTP_CHAIN) ? true : false; + + /* + * If we have to split leaf page in the chain of duplicates by new + * duplicate then we try to look at our right sibling first. */ - if(P_RIGHTMOST(ppageop)) - upditem_offset = P_HIKEY; + if ((lpageop->btpo_flags & BTP_CHAIN) && + (lpageop->btpo_flags & BTP_LEAF) && keys_equal) + { + bool use_left = true; + + rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + if (!P_RIGHTMOST(rpageop)) /* non-rightmost page */ + { /* If we have the same hikey here then + * it's yet another page in chain. */ + if (_bt_skeycmp(rel, keysz, scankey, rpage, + PageGetItemId(rpage, P_HIKEY), + BTEqualStrategyNumber)) + { + if (!(rpageop->btpo_flags & BTP_CHAIN)) + elog(FATAL, "btree: lost page in the chain of duplicates"); + } + else if (_bt_skeycmp(rel, keysz, scankey, rpage, + PageGetItemId(rpage, P_HIKEY), + BTGreaterStrategyNumber)) + elog(FATAL, "btree: hikey is out of order"); + else if (rpageop->btpo_flags & BTP_CHAIN) + + /* + * If hikey > scankey then it's last page in chain and + * BTP_CHAIN must be OFF + */ + elog(FATAL, "btree: lost last page in the chain of duplicates"); + + /* if there is room here then we use this page. */ + if (PageGetFreeSpace(rpage) > itemsz) + use_left = false; + } + else +/* rightmost page */ + { + Assert(!(rpageop->btpo_flags & BTP_CHAIN)); + /* if there is room here then we use this page. */ + if (PageGetFreeSpace(rpage) > itemsz) + use_left = false; + } + if (!use_left) /* insert on the right page */ + { + _bt_relbuf(rel, buf, BT_WRITE); + return (_bt_insertonpg(rel, rbuf, stack, keysz, + scankey, btitem, afteritem)); + } + _bt_relbuf(rel, rbuf, BT_WRITE); + } + + /* + * If after splitting un-chained page we'll got chain of pages + * with duplicates then we want to know 1. on which of two pages + * new btitem will go (current _bt_findsplitloc is quite bad); 2. + * what parent (if there's one) thinking about it (remember about + * deletions) + */ + else if (!(lpageop->btpo_flags & BTP_CHAIN)) + { + OffsetNumber start = (P_RIGHTMOST(lpageop)) ? P_HIKEY : P_FIRSTKEY; + Size llimit; + + maxoff = PageGetMaxOffsetNumber(page); + llimit = PageGetPageSize(page) - sizeof(PageHeaderData) - + DOUBLEALIGN(sizeof(BTPageOpaqueData)) + + sizeof(ItemIdData); + llimit /= 2; + firstright = _bt_findsplitloc(rel, page, start, maxoff, llimit); + + if (_bt_itemcmp(rel, keysz, + (BTItem) PageGetItem(page, PageGetItemId(page, start)), + (BTItem) PageGetItem(page, PageGetItemId(page, firstright)), + BTEqualStrategyNumber)) + { + if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, firstright), + BTLessStrategyNumber)) + + /* + * force moving current items to the new page: new + * item will go on the current page. + */ + firstright = start; + else + + /* + * new btitem >= firstright, start item == firstright + * - new chain of duplicates: if this non-leftmost + * leaf page and parent item < start item then force + * moving all items to the new page - current page + * will be "empty" after it. + */ + { + if (!P_LEFTMOST(lpageop) && + (lpageop->btpo_flags & BTP_LEAF)) + { + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), + bknum, P_HIKEY); + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + if (_bt_itemcmp(rel, keysz, stack->bts_btitem, + (BTItem) PageGetItem(page, + PageGetItemId(page, start)), + BTLessStrategyNumber)) + { + firstright = start; + shifted = true; + } + _bt_relbuf(rel, pbuf, BT_WRITE); + } + } + } /* else - no new chain if start item < + * firstright one */ + } + + /* split the buffer into left and right halves */ + rbuf = _bt_split(rel, buf, firstright); + + /* which new page (left half or right half) gets the tuple? */ + if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) + { + /* left page */ + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + } else - upditem_offset = P_FIRSTKEY; - if ( !P_LEFTMOST(lpageop) || - stack->bts_offset != upditem_offset ) - elog (FATAL, "btree: items are out of order (leftmost %d, stack %u, update %u)", - P_LEFTMOST(lpageop), stack->bts_offset, upditem_offset); - } - - if ( do_update ) - { - if ( shifted ) - elog (FATAL, "btree: attempt to update parent for shifted page"); - /* - * Try to update in place. If out parent page is chained - * then we must forse insertion. + { + /* right page */ + itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(rbuf); + } + + maxoff = PageGetMaxOffsetNumber(page); + if (shifted) + { + if (maxoff > P_FIRSTKEY) + elog(FATAL, "btree: shifted page is not empty"); + lowLeftItem = (BTItem) NULL; + } + else + { + if (maxoff < P_FIRSTKEY) + elog(FATAL, "btree: un-shifted page is empty"); + lowLeftItem = (BTItem) PageGetItem(page, + PageGetItemId(page, P_FIRSTKEY)); + if (_bt_itemcmp(rel, keysz, lowLeftItem, + (BTItem) PageGetItem(page, PageGetItemId(page, P_HIKEY)), + BTEqualStrategyNumber)) + lpageop->btpo_flags |= BTP_CHAIN; + } + + /* + * By here, + * + * + our target page has been split; + the original tuple has been + * inserted; + we have write locks on both the old (left half) + * and new (right half) buffers, after the split; and + we have + * the key we want to insert into the parent. + * + * Do the parent insertion. We need to hold onto the locks for the + * child pages until we locate the parent, but we can release them + * before doing the actual insertion (see Lehman and Yao for the + * reasoning). */ - if ( !parent_chained && - DOUBLEALIGN (IndexTupleDSize (lowLeftItem->bti_itup)) == - DOUBLEALIGN (IndexTupleDSize (stack->bts_btitem->bti_itup)) ) - { - _bt_updateitem(rel, keysz, pbuf, - stack->bts_btitem, lowLeftItem); - _bt_relbuf(rel, buf, BT_WRITE); - _bt_relbuf(rel, rbuf, BT_WRITE); + + if (stack == (BTStack) NULL) + { + + /* create a new root node and release the split buffers */ + _bt_newroot(rel, buf, rbuf); + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + } else { - update_in_place = false; - PageIndexTupleDelete(ppage, upditem_offset); - - /* - * don't write anything out yet--we still have the write - * lock, and now we call another _bt_insertonpg to - * insert the correct key. - * First, make a new item, using the tuple data from - * lowLeftItem. Point it to the left child. - * Update it on the stack at the same time. - */ - pfree(stack->bts_btitem); - stack->bts_btitem = _bt_formitem(&(lowLeftItem->bti_itup)); - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), - bknum, P_HIKEY); - - /* - * Unlock the children before doing this - * - * Mmm ... I foresee problems here. - vadim 06/10/97 - */ - _bt_relbuf(rel, buf, BT_WRITE); - _bt_relbuf(rel, rbuf, BT_WRITE); - - /* - * A regular _bt_binsrch should find the right place to - * put the new entry, since it should be lower than any - * other key on the page. - * Therefore set afteritem to NULL. - */ - newskey = _bt_mkscankey(rel, &(stack->bts_btitem->bti_itup)); - newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, - keysz, newskey, stack->bts_btitem, - NULL); - - pfree(newres); - pfree(newskey); - - /* - * we have now lost our lock on the parent buffer, and - * need to get it back. - */ - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + ScanKey newskey; + InsertIndexResult newres; + BTItem new_item; + OffsetNumber upditem_offset = P_HIKEY; + bool do_update = false; + bool update_in_place = true; + bool parent_chained; + + /* form a index tuple that points at the new right page */ + rbknum = BufferGetBlockNumber(rbuf); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* + * By convention, the first entry (1) on every non-rightmost + * page is the high key for that page. In order to get the + * lowest key on the new right page, we actually look at its + * second (2) entry. + */ + + if (!P_RIGHTMOST(rpageop)) + { + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_FIRSTKEY)); + if (_bt_itemcmp(rel, keysz, ritem, + (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_HIKEY)), + BTEqualStrategyNumber)) + rpageop->btpo_flags |= BTP_CHAIN; + } + else + ritem = (BTItem) PageGetItem(rpage, + PageGetItemId(rpage, P_HIKEY)); + + /* get a unique btitem for this key */ + new_item = _bt_formitem(&(ritem->bti_itup)); + + ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY); + + /* + * Find the parent buffer and get the parent page. + * + * Oops - if we were moved right then we need to change stack + * item! We want to find parent pointing to where we are, + * right ? - vadim 05/27/97 + */ + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), + bknum, P_HIKEY); + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + ppage = BufferGetPage(pbuf); + ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); + parent_chained = ((ppageop->btpo_flags & BTP_CHAIN)) ? true : false; + + if (parent_chained && !left_chained) + elog(FATAL, "nbtree: unexpected chained parent of unchained page"); + + /* + * If the key of new_item is < than the key of the item in the + * parent page pointing to the left page (stack->bts_btitem), + * we have to update the latter key; otherwise the keys on the + * parent page wouldn't be monotonically increasing after we + * inserted the new pointer to the right page (new_item). This + * only happens if our left page is the leftmost page and a + * new minimum key had been inserted before, which is not + * reflected in the parent page but didn't matter so far. If + * there are duplicate keys and this new minimum key spills + * over to our new right page, we get an inconsistency if we + * don't update the left key in the parent page. + * + * Also, new duplicates handling code require us to update parent + * item if some smaller items left on the left page (which is + * possible in splitting leftmost page) and current parent + * item == new_item. - vadim 05/27/97 + */ + if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item, + BTGreaterStrategyNumber) || + (!shifted && + _bt_itemcmp(rel, keysz, stack->bts_btitem, + new_item, BTEqualStrategyNumber) && + _bt_itemcmp(rel, keysz, lowLeftItem, + new_item, BTLessStrategyNumber))) + { + do_update = true; + + /* + * figure out which key is leftmost (if the parent page is + * rightmost, too, it must be the root) + */ + if (P_RIGHTMOST(ppageop)) + upditem_offset = P_HIKEY; + else + upditem_offset = P_FIRSTKEY; + if (!P_LEFTMOST(lpageop) || + stack->bts_offset != upditem_offset) + elog(FATAL, "btree: items are out of order (leftmost %d, stack %u, update %u)", + P_LEFTMOST(lpageop), stack->bts_offset, upditem_offset); + } + + if (do_update) + { + if (shifted) + elog(FATAL, "btree: attempt to update parent for shifted page"); + + /* + * Try to update in place. If out parent page is chained + * then we must forse insertion. + */ + if (!parent_chained && + DOUBLEALIGN(IndexTupleDSize(lowLeftItem->bti_itup)) == + DOUBLEALIGN(IndexTupleDSize(stack->bts_btitem->bti_itup))) + { + _bt_updateitem(rel, keysz, pbuf, + stack->bts_btitem, lowLeftItem); + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + } + else + { + update_in_place = false; + PageIndexTupleDelete(ppage, upditem_offset); + + /* + * don't write anything out yet--we still have the + * write lock, and now we call another _bt_insertonpg + * to insert the correct key. First, make a new item, + * using the tuple data from lowLeftItem. Point it to + * the left child. Update it on the stack at the same + * time. + */ + pfree(stack->bts_btitem); + stack->bts_btitem = _bt_formitem(&(lowLeftItem->bti_itup)); + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), + bknum, P_HIKEY); + + /* + * Unlock the children before doing this + * + * Mmm ... I foresee problems here. - vadim 06/10/97 + */ + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + + /* + * A regular _bt_binsrch should find the right place + * to put the new entry, since it should be lower than + * any other key on the page. Therefore set afteritem + * to NULL. + */ + newskey = _bt_mkscankey(rel, &(stack->bts_btitem->bti_itup)); + newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, newskey, stack->bts_btitem, + NULL); + + pfree(newres); + pfree(newskey); + + /* + * we have now lost our lock on the parent buffer, and + * need to get it back. + */ + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + } + } + else + { + _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, rbuf, BT_WRITE); + } + + newskey = _bt_mkscankey(rel, &(new_item->bti_itup)); + + afteritem = stack->bts_btitem; + if (parent_chained && !update_in_place) + { + ppage = BufferGetPage(pbuf); + ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); + if (ppageop->btpo_flags & BTP_CHAIN) + elog(FATAL, "btree: unexpected BTP_CHAIN flag in parent after update"); + if (P_RIGHTMOST(ppageop)) + elog(FATAL, "btree: chained parent is RIGHTMOST after update"); + maxoff = PageGetMaxOffsetNumber(ppage); + if (maxoff != P_FIRSTKEY) + elog(FATAL, "btree: FIRSTKEY was unexpected in parent after update"); + if (_bt_skeycmp(rel, keysz, newskey, ppage, + PageGetItemId(ppage, P_FIRSTKEY), + BTLessEqualStrategyNumber)) + elog(FATAL, "btree: parent FIRSTKEY is >= duplicate key after update"); + if (!_bt_skeycmp(rel, keysz, newskey, ppage, + PageGetItemId(ppage, P_HIKEY), + BTEqualStrategyNumber)) + elog(FATAL, "btree: parent HIGHKEY is not equal duplicate key after update"); + afteritem = (BTItem) NULL; + } + else if (left_chained && !update_in_place) + { + ppage = BufferGetPage(pbuf); + ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); + if (!P_RIGHTMOST(ppageop) && + _bt_skeycmp(rel, keysz, newskey, ppage, + PageGetItemId(ppage, P_HIKEY), + BTGreaterStrategyNumber)) + afteritem = (BTItem) NULL; + } + if (afteritem == (BTItem) NULL) + { + rbuf = _bt_getbuf(rel, ppageop->btpo_next, BT_WRITE); + _bt_relbuf(rel, pbuf, BT_WRITE); + pbuf = rbuf; + } + + newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, newskey, new_item, + afteritem); + + /* be tidy */ + pfree(newres); + pfree(newskey); + pfree(new_item); } - } - else - { - _bt_relbuf(rel, buf, BT_WRITE); - _bt_relbuf(rel, rbuf, BT_WRITE); - } - - newskey = _bt_mkscankey(rel, &(new_item->bti_itup)); - - afteritem = stack->bts_btitem; - if ( parent_chained && !update_in_place ) - { - ppage = BufferGetPage(pbuf); - ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); - if ( ppageop->btpo_flags & BTP_CHAIN ) - elog (FATAL, "btree: unexpected BTP_CHAIN flag in parent after update"); - if ( P_RIGHTMOST (ppageop) ) - elog (FATAL, "btree: chained parent is RIGHTMOST after update"); - maxoff = PageGetMaxOffsetNumber (ppage); - if ( maxoff != P_FIRSTKEY ) - elog (FATAL, "btree: FIRSTKEY was unexpected in parent after update"); - if ( _bt_skeycmp (rel, keysz, newskey, ppage, - PageGetItemId(ppage, P_FIRSTKEY), - BTLessEqualStrategyNumber) ) - elog (FATAL, "btree: parent FIRSTKEY is >= duplicate key after update"); - if ( !_bt_skeycmp (rel, keysz, newskey, ppage, - PageGetItemId(ppage, P_HIKEY), - BTEqualStrategyNumber) ) - elog (FATAL, "btree: parent HIGHKEY is not equal duplicate key after update"); - afteritem = (BTItem) NULL; - } - else if ( left_chained && !update_in_place ) - { - ppage = BufferGetPage(pbuf); - ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); - if ( !P_RIGHTMOST (ppageop) && - _bt_skeycmp (rel, keysz, newskey, ppage, - PageGetItemId(ppage, P_HIKEY), - BTGreaterStrategyNumber) ) - afteritem = (BTItem) NULL; - } - if ( afteritem == (BTItem) NULL) - { - rbuf = _bt_getbuf(rel, ppageop->btpo_next, BT_WRITE); - _bt_relbuf(rel, pbuf, BT_WRITE); - pbuf = rbuf; - } - - newres = _bt_insertonpg(rel, pbuf, stack->bts_parent, - keysz, newskey, new_item, - afteritem); - - /* be tidy */ - pfree(newres); - pfree(newskey); - pfree(new_item); } - } else { - itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, - itemsz, btitem, afteritem); - itup_blkno = BufferGetBlockNumber(buf); - - _bt_relbuf(rel, buf, BT_WRITE); - } - - /* by here, the new tuple is inserted */ - res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); - ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); - - return (res); + else + { + itup_off = _bt_pgaddtup(rel, buf, keysz, scankey, + itemsz, btitem, afteritem); + itup_blkno = BufferGetBlockNumber(buf); + + _bt_relbuf(rel, buf, BT_WRITE); + } + + /* by here, the new tuple is inserted */ + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + ItemPointerSet(&(res->pointerData), itup_blkno, itup_off); + + return (res); } /* - * _bt_split() -- split a page in the btree. + * _bt_split() -- split a page in the btree. * - * On entry, buf is the page to split, and is write-locked and pinned. - * Returns the new right sibling of buf, pinned and write-locked. The - * pin and lock on buf are maintained. + * On entry, buf is the page to split, and is write-locked and pinned. + * Returns the new right sibling of buf, pinned and write-locked. The + * pin and lock on buf are maintained. */ -static Buffer +static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright) { - Buffer rbuf; - Page origpage; - Page leftpage, rightpage; - BTPageOpaque ropaque, lopaque, oopaque; - Buffer sbuf; - Page spage; - BTPageOpaque sopaque; - Size itemsz; - ItemId itemid; - BTItem item; - OffsetNumber leftoff, rightoff; - OffsetNumber start; - OffsetNumber maxoff; - OffsetNumber i; - - rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - origpage = BufferGetPage(buf); - leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); - rightpage = BufferGetPage(rbuf); - - _bt_pageinit(rightpage, BufferGetPageSize(rbuf)); - _bt_pageinit(leftpage, BufferGetPageSize(buf)); - - /* init btree private data */ - oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); - lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); - - /* if we're splitting this page, it won't be the root when we're done */ - oopaque->btpo_flags &= ~BTP_ROOT; - oopaque->btpo_flags &= ~BTP_CHAIN; - lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags; - lopaque->btpo_prev = oopaque->btpo_prev; - ropaque->btpo_prev = BufferGetBlockNumber(buf); - lopaque->btpo_next = BufferGetBlockNumber(rbuf); - ropaque->btpo_next = oopaque->btpo_next; - - /* - * If the page we're splitting is not the rightmost page at its - * level in the tree, then the first (0) entry on the page is the - * high key for the page. We need to copy that to the right - * half. Otherwise (meaning the rightmost page case), we should - * treat the line pointers beginning at zero as user data. - * - * We leave a blank space at the start of the line table for the - * left page. We'll come back later and fill it in with the high - * key item we get from the right key. - */ - - leftoff = P_FIRSTKEY; - ropaque->btpo_next = oopaque->btpo_next; - if (! P_RIGHTMOST(oopaque)) { - /* splitting a non-rightmost page, start at the first data item */ - start = P_FIRSTKEY; - - itemid = PageGetItemId(origpage, P_HIKEY); - itemsz = ItemIdGetLength(itemid); - item = (BTItem) PageGetItem(origpage, itemid); - if ( PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add hikey to the right sibling"); - rightoff = P_FIRSTKEY; - } else { - /* splitting a rightmost page, "high key" is the first data item */ - start = P_HIKEY; - - /* the new rightmost page will not have a high key */ - rightoff = P_HIKEY; - } - maxoff = PageGetMaxOffsetNumber(origpage); - if ( firstright == InvalidOffsetNumber ) - { - Size llimit = PageGetFreeSpace(leftpage) / 2; - firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit); - } - - for (i = start; i <= maxoff; i = OffsetNumberNext(i)) { - itemid = PageGetItemId(origpage, i); + Buffer rbuf; + Page origpage; + Page leftpage, + rightpage; + BTPageOpaque ropaque, + lopaque, + oopaque; + Buffer sbuf; + Page spage; + BTPageOpaque sopaque; + Size itemsz; + ItemId itemid; + BTItem item; + OffsetNumber leftoff, + rightoff; + OffsetNumber start; + OffsetNumber maxoff; + OffsetNumber i; + + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + origpage = BufferGetPage(buf); + leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData)); + rightpage = BufferGetPage(rbuf); + + _bt_pageinit(rightpage, BufferGetPageSize(rbuf)); + _bt_pageinit(leftpage, BufferGetPageSize(buf)); + + /* init btree private data */ + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* if we're splitting this page, it won't be the root when we're done */ + oopaque->btpo_flags &= ~BTP_ROOT; + oopaque->btpo_flags &= ~BTP_CHAIN; + lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_prev = oopaque->btpo_prev; + ropaque->btpo_prev = BufferGetBlockNumber(buf); + lopaque->btpo_next = BufferGetBlockNumber(rbuf); + ropaque->btpo_next = oopaque->btpo_next; + + /* + * If the page we're splitting is not the rightmost page at its level + * in the tree, then the first (0) entry on the page is the high key + * for the page. We need to copy that to the right half. Otherwise + * (meaning the rightmost page case), we should treat the line + * pointers beginning at zero as user data. + * + * We leave a blank space at the start of the line table for the left + * page. We'll come back later and fill it in with the high key item + * we get from the right key. + */ + + leftoff = P_FIRSTKEY; + ropaque->btpo_next = oopaque->btpo_next; + if (!P_RIGHTMOST(oopaque)) + { + /* splitting a non-rightmost page, start at the first data item */ + start = P_FIRSTKEY; + + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + if (PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add hikey to the right sibling"); + rightoff = P_FIRSTKEY; + } + else + { + /* splitting a rightmost page, "high key" is the first data item */ + start = P_HIKEY; + + /* the new rightmost page will not have a high key */ + rightoff = P_HIKEY; + } + maxoff = PageGetMaxOffsetNumber(origpage); + if (firstright == InvalidOffsetNumber) + { + Size llimit = PageGetFreeSpace(leftpage) / 2; + + firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit); + } + + for (i = start; i <= maxoff; i = OffsetNumberNext(i)) + { + itemid = PageGetItemId(origpage, i); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(origpage, itemid); + + /* decide which page to put it on */ + if (i < firstright) + { + if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, + LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the left sibling"); + leftoff = OffsetNumberNext(leftoff); + } + else + { + if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the right sibling"); + rightoff = OffsetNumberNext(rightoff); + } + } + + /* + * Okay, page has been split, high key on right page is correct. Now + * set the high key on the left page to be the min key on the right + * page. + */ + + if (P_RIGHTMOST(ropaque)) + { + itemid = PageGetItemId(rightpage, P_HIKEY); + } + else + { + itemid = PageGetItemId(rightpage, P_FIRSTKEY); + } itemsz = ItemIdGetLength(itemid); - item = (BTItem) PageGetItem(origpage, itemid); - - /* decide which page to put it on */ - if (i < firstright) { - if ( PageAddItem(leftpage, (Item) item, itemsz, leftoff, - LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the left sibling"); - leftoff = OffsetNumberNext(leftoff); - } else { - if ( PageAddItem(rightpage, (Item) item, itemsz, rightoff, - LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the right sibling"); - rightoff = OffsetNumberNext(rightoff); + item = (BTItem) PageGetItem(rightpage, itemid); + + /* + * We left a hole for the high key on the left page; fill it. The + * modal crap is to tell the page manager to put the new item on the + * page and not screw around with anything else. Whoever designed + * this interface has presumably crawled back into the dung heap they + * came from. No one here will admit to it. + */ + + PageManagerModeSet(OverwritePageManagerMode); + if (PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add hikey to the left sibling"); + PageManagerModeSet(ShufflePageManagerMode); + + /* + * By here, the original data page has been split into two new halves, + * and these are correct. The algorithm requires that the left page + * never move during a split, so we copy the new left page back on top + * of the original. Note that this is not a waste of time, since we + * also require (in the page management code) that the center of a + * page always be clean, and the most efficient way to guarantee this + * is just to compact the data by reinserting it into a new left page. + */ + + PageRestoreTempPage(leftpage, origpage); + + /* write these guys out */ + _bt_wrtnorelbuf(rel, rbuf); + _bt_wrtnorelbuf(rel, buf); + + /* + * Finally, we need to grab the right sibling (if any) and fix the + * prev pointer there. We are guaranteed that this is deadlock-free + * since no other writer will be moving holding a lock on that page + * and trying to move left, and all readers release locks on a page + * before trying to fetch its neighbors. + */ + + if (!P_RIGHTMOST(ropaque)) + { + sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); + spage = BufferGetPage(sbuf); + sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); + sopaque->btpo_prev = BufferGetBlockNumber(rbuf); + + /* write and release the old right sibling */ + _bt_wrtbuf(rel, sbuf); } - } - - /* - * Okay, page has been split, high key on right page is correct. Now - * set the high key on the left page to be the min key on the right - * page. - */ - - if (P_RIGHTMOST(ropaque)) { - itemid = PageGetItemId(rightpage, P_HIKEY); - } else { - itemid = PageGetItemId(rightpage, P_FIRSTKEY); - } - itemsz = ItemIdGetLength(itemid); - item = (BTItem) PageGetItem(rightpage, itemid); - - /* - * We left a hole for the high key on the left page; fill it. The - * modal crap is to tell the page manager to put the new item on the - * page and not screw around with anything else. Whoever designed - * this interface has presumably crawled back into the dung heap they - * came from. No one here will admit to it. - */ - - PageManagerModeSet(OverwritePageManagerMode); - if ( PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add hikey to the left sibling"); - PageManagerModeSet(ShufflePageManagerMode); - - /* - * By here, the original data page has been split into two new halves, - * and these are correct. The algorithm requires that the left page - * never move during a split, so we copy the new left page back on top - * of the original. Note that this is not a waste of time, since we - * also require (in the page management code) that the center of a - * page always be clean, and the most efficient way to guarantee this - * is just to compact the data by reinserting it into a new left page. - */ - - PageRestoreTempPage(leftpage, origpage); - - /* write these guys out */ - _bt_wrtnorelbuf(rel, rbuf); - _bt_wrtnorelbuf(rel, buf); - - /* - * Finally, we need to grab the right sibling (if any) and fix the - * prev pointer there. We are guaranteed that this is deadlock-free - * since no other writer will be moving holding a lock on that page - * and trying to move left, and all readers release locks on a page - * before trying to fetch its neighbors. - */ - - if (! P_RIGHTMOST(ropaque)) { - sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); - spage = BufferGetPage(sbuf); - sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); - sopaque->btpo_prev = BufferGetBlockNumber(rbuf); - - /* write and release the old right sibling */ - _bt_wrtbuf(rel, sbuf); - } - - /* split's done */ - return (rbuf); + + /* split's done */ + return (rbuf); } /* - * _bt_findsplitloc() -- find a safe place to split a page. + * _bt_findsplitloc() -- find a safe place to split a page. * - * In order to guarantee the proper handling of searches for duplicate - * keys, the first duplicate in the chain must either be the first - * item on the page after the split, or the entire chain must be on - * one of the two pages. That is, - * [1 2 2 2 3 4 5] - * must become - * [1] [2 2 2 3 4 5] - * or - * [1 2 2 2] [3 4 5] - * but not - * [1 2 2] [2 3 4 5]. - * However, - * [2 2 2 2 2 3 4] - * may be split as - * [2 2 2 2] [2 3 4]. + * In order to guarantee the proper handling of searches for duplicate + * keys, the first duplicate in the chain must either be the first + * item on the page after the split, or the entire chain must be on + * one of the two pages. That is, + * [1 2 2 2 3 4 5] + * must become + * [1] [2 2 2 3 4 5] + * or + * [1 2 2 2] [3 4 5] + * but not + * [1 2 2] [2 3 4 5]. + * However, + * [2 2 2 2 2 3 4] + * may be split as + * [2 2 2 2] [2 3 4]. */ -static OffsetNumber +static OffsetNumber _bt_findsplitloc(Relation rel, - Page page, - OffsetNumber start, - OffsetNumber maxoff, - Size llimit) + Page page, + OffsetNumber start, + OffsetNumber maxoff, + Size llimit) { - OffsetNumber i; - OffsetNumber saferight; - ItemId nxtitemid, safeitemid; - BTItem safeitem, nxtitem; - Size nbytes; - int natts; - - if ( start >= maxoff ) - elog (FATAL, "btree: cannot split if start (%d) >= maxoff (%d)", - start, maxoff); - natts = rel->rd_rel->relnatts; - saferight = start; - safeitemid = PageGetItemId(page, saferight); - nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData); - safeitem = (BTItem) PageGetItem(page, safeitemid); - - i = OffsetNumberNext(start); - - while (nbytes < llimit) - { - /* check the next item on the page */ - nxtitemid = PageGetItemId(page, i); - nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData)); - nxtitem = (BTItem) PageGetItem(page, nxtitemid); - - /* - * Test against last known safe item: - * if the tuple we're looking at isn't equal to the last safe - * one we saw, then it's our new safe tuple. - */ - if ( !_bt_itemcmp (rel, natts, - safeitem, nxtitem, BTEqualStrategyNumber) ) + OffsetNumber i; + OffsetNumber saferight; + ItemId nxtitemid, + safeitemid; + BTItem safeitem, + nxtitem; + Size nbytes; + int natts; + + if (start >= maxoff) + elog(FATAL, "btree: cannot split if start (%d) >= maxoff (%d)", + start, maxoff); + natts = rel->rd_rel->relnatts; + saferight = start; + safeitemid = PageGetItemId(page, saferight); + nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData); + safeitem = (BTItem) PageGetItem(page, safeitemid); + + i = OffsetNumberNext(start); + + while (nbytes < llimit) { - safeitem = nxtitem; - saferight = i; + /* check the next item on the page */ + nxtitemid = PageGetItemId(page, i); + nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData)); + nxtitem = (BTItem) PageGetItem(page, nxtitemid); + + /* + * Test against last known safe item: if the tuple we're looking + * at isn't equal to the last safe one we saw, then it's our new + * safe tuple. + */ + if (!_bt_itemcmp(rel, natts, + safeitem, nxtitem, BTEqualStrategyNumber)) + { + safeitem = nxtitem; + saferight = i; + } + if (i < maxoff) + i = OffsetNumberNext(i); + else + break; } - if ( i < maxoff ) - i = OffsetNumberNext(i); - else - break; - } - - /* - * If the chain of dups starts at the beginning of the page and extends - * past the halfway mark, we can split it in the middle. - */ - - if (saferight == start) - saferight = i; - - if ( saferight == maxoff && ( maxoff - start ) > 1 ) - saferight = start + ( maxoff - start ) / 2; - - return (saferight); + + /* + * If the chain of dups starts at the beginning of the page and + * extends past the halfway mark, we can split it in the middle. + */ + + if (saferight == start) + saferight = i; + + if (saferight == maxoff && (maxoff - start) > 1) + saferight = start + (maxoff - start) / 2; + + return (saferight); } /* - * _bt_newroot() -- Create a new root page for the index. + * _bt_newroot() -- Create a new root page for the index. * - * We've just split the old root page and need to create a new one. - * In order to do this, we add a new root page to the file, then lock - * the metadata page and update it. This is guaranteed to be deadlock- - * free, because all readers release their locks on the metadata page - * before trying to lock the root, and all writers lock the root before - * trying to lock the metadata page. We have a write lock on the old - * root page, so we have not introduced any cycles into the waits-for - * graph. + * We've just split the old root page and need to create a new one. + * In order to do this, we add a new root page to the file, then lock + * the metadata page and update it. This is guaranteed to be deadlock- + * free, because all readers release their locks on the metadata page + * before trying to lock the root, and all writers lock the root before + * trying to lock the metadata page. We have a write lock on the old + * root page, so we have not introduced any cycles into the waits-for + * graph. * - * On entry, lbuf (the old root) and rbuf (its new peer) are write- - * locked. We don't drop the locks in this routine; that's done by - * the caller. On exit, a new root page exists with entries for the - * two new children. The new root page is neither pinned nor locked. + * On entry, lbuf (the old root) and rbuf (its new peer) are write- + * locked. We don't drop the locks in this routine; that's done by + * the caller. On exit, a new root page exists with entries for the + * two new children. The new root page is neither pinned nor locked. */ static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) { - Buffer rootbuf; - Page lpage, rpage, rootpage; - BlockNumber lbkno, rbkno; - BlockNumber rootbknum; - BTPageOpaque rootopaque; - ItemId itemid; - BTItem item; - Size itemsz; - BTItem new_item; - - /* get a new root page */ - rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - rootpage = BufferGetPage(rootbuf); - _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); - - /* set btree special data */ - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); - rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; - rootopaque->btpo_flags |= BTP_ROOT; - - /* - * Insert the internal tuple pointers. - */ - - lbkno = BufferGetBlockNumber(lbuf); - rbkno = BufferGetBlockNumber(rbuf); - lpage = BufferGetPage(lbuf); - rpage = BufferGetPage(rbuf); - - /* - * step over the high key on the left page while building the - * left page pointer. - */ - itemid = PageGetItemId(lpage, P_FIRSTKEY); - itemsz = ItemIdGetLength(itemid); - item = (BTItem) PageGetItem(lpage, itemid); - new_item = _bt_formitem(&(item->bti_itup)); - ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY); - - /* - * insert the left page pointer into the new root page. the root - * page is the rightmost page on its level so the "high key" item - * is the first data item. - */ - if ( PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add leftkey to new root page"); - pfree(new_item); - - /* - * the right page is the rightmost page on the second level, so - * the "high key" item is the first data item on that page as well. - */ - itemid = PageGetItemId(rpage, P_HIKEY); - itemsz = ItemIdGetLength(itemid); - item = (BTItem) PageGetItem(rpage, itemid); - new_item = _bt_formitem(&(item->bti_itup)); - ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY); - - /* - * insert the right page pointer into the new root page. - */ - if ( PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add rightkey to new root page"); - pfree(new_item); - - /* write and let go of the root buffer */ - rootbknum = BufferGetBlockNumber(rootbuf); - _bt_wrtbuf(rel, rootbuf); - - /* update metadata page with new root block number */ - _bt_metaproot(rel, rootbknum, 0); + Buffer rootbuf; + Page lpage, + rpage, + rootpage; + BlockNumber lbkno, + rbkno; + BlockNumber rootbknum; + BTPageOpaque rootopaque; + ItemId itemid; + BTItem item; + Size itemsz; + BTItem new_item; + + /* get a new root page */ + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootpage = BufferGetPage(rootbuf); + _bt_pageinit(rootpage, BufferGetPageSize(rootbuf)); + + /* set btree special data */ + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); + rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; + rootopaque->btpo_flags |= BTP_ROOT; + + /* + * Insert the internal tuple pointers. + */ + + lbkno = BufferGetBlockNumber(lbuf); + rbkno = BufferGetBlockNumber(rbuf); + lpage = BufferGetPage(lbuf); + rpage = BufferGetPage(rbuf); + + /* + * step over the high key on the left page while building the left + * page pointer. + */ + itemid = PageGetItemId(lpage, P_FIRSTKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(lpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY); + + /* + * insert the left page pointer into the new root page. the root page + * is the rightmost page on its level so the "high key" item is the + * first data item. + */ + if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add leftkey to new root page"); + pfree(new_item); + + /* + * the right page is the rightmost page on the second level, so the + * "high key" item is the first data item on that page as well. + */ + itemid = PageGetItemId(rpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (BTItem) PageGetItem(rpage, itemid); + new_item = _bt_formitem(&(item->bti_itup)); + ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY); + + /* + * insert the right page pointer into the new root page. + */ + if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add rightkey to new root page"); + pfree(new_item); + + /* write and let go of the root buffer */ + rootbknum = BufferGetBlockNumber(rootbuf); + _bt_wrtbuf(rel, rootbuf); + + /* update metadata page with new root block number */ + _bt_metaproot(rel, rootbknum, 0); } /* - * _bt_pgaddtup() -- add a tuple to a particular page in the index. + * _bt_pgaddtup() -- add a tuple to a particular page in the index. * - * This routine adds the tuple to the page as requested, and keeps the - * write lock and reference associated with the page's buffer. It is - * an error to call pgaddtup() without a write lock and reference. If - * afteritem is non-null, it's the item that we expect our new item - * to follow. Otherwise, we do a binary search for the correct place - * and insert the new item there. + * This routine adds the tuple to the page as requested, and keeps the + * write lock and reference associated with the page's buffer. It is + * an error to call pgaddtup() without a write lock and reference. If + * afteritem is non-null, it's the item that we expect our new item + * to follow. Otherwise, we do a binary search for the correct place + * and insert the new item there. */ -static OffsetNumber +static OffsetNumber _bt_pgaddtup(Relation rel, - Buffer buf, - int keysz, - ScanKey itup_scankey, - Size itemsize, - BTItem btitem, - BTItem afteritem) + Buffer buf, + int keysz, + ScanKey itup_scankey, + Size itemsize, + BTItem btitem, + BTItem afteritem) { - OffsetNumber itup_off; - OffsetNumber first; - Page page; - BTPageOpaque opaque; - BTItem chkitem; - - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - if (afteritem == (BTItem) NULL) { - itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION); - } else { - itup_off = first; - - do { - chkitem = - (BTItem) PageGetItem(page, PageGetItemId(page, itup_off)); - itup_off = OffsetNumberNext(itup_off); - } while ( ! BTItemSame (chkitem, afteritem) ); - } - - if ( PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the page"); - - /* write the buffer, but hold our lock */ - _bt_wrtnorelbuf(rel, buf); - - return (itup_off); + OffsetNumber itup_off; + OffsetNumber first; + Page page; + BTPageOpaque opaque; + BTItem chkitem; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (afteritem == (BTItem) NULL) + { + itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION); + } + else + { + itup_off = first; + + do + { + chkitem = + (BTItem) PageGetItem(page, PageGetItemId(page, itup_off)); + itup_off = OffsetNumberNext(itup_off); + } while (!BTItemSame(chkitem, afteritem)); + } + + if (PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page"); + + /* write the buffer, but hold our lock */ + _bt_wrtnorelbuf(rel, buf); + + return (itup_off); } /* - * _bt_goesonpg() -- Does a new tuple belong on this page? + * _bt_goesonpg() -- Does a new tuple belong on this page? * - * This is part of the complexity introduced by allowing duplicate - * keys into the index. The tuple belongs on this page if: + * This is part of the complexity introduced by allowing duplicate + * keys into the index. The tuple belongs on this page if: * - * + there is no page to the right of this one; or - * + it is less than the high key on the page; or - * + the item it is to follow ("afteritem") appears on this - * page. + * + there is no page to the right of this one; or + * + it is less than the high key on the page; or + * + the item it is to follow ("afteritem") appears on this + * page. */ -static bool +static bool _bt_goesonpg(Relation rel, - Buffer buf, - Size keysz, - ScanKey scankey, - BTItem afteritem) + Buffer buf, + Size keysz, + ScanKey scankey, + BTItem afteritem) { - Page page; - ItemId hikey; - BTPageOpaque opaque; - BTItem chkitem; - OffsetNumber offnum, maxoff; - bool found; - - page = BufferGetPage(buf); - - /* no right neighbor? */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_RIGHTMOST(opaque)) - return (true); - - /* - * this is a non-rightmost page, so it must have a high key item. - * - * If the scan key is < the high key (the min key on the next page), - * then it for sure belongs here. - */ - hikey = PageGetItemId(page, P_HIKEY); - if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber)) - return (true); - - /* - * If the scan key is > the high key, then it for sure doesn't belong - * here. - */ - - if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber)) - return (false); - - /* - * If we have no adjacency information, and the item is equal to the - * high key on the page (by here it is), then the item does not belong - * on this page. - * - * Now it's not true in all cases. - vadim 06/10/97 - */ - - if (afteritem == (BTItem) NULL) - { - if ( opaque->btpo_flags & BTP_LEAF ) - return (false); - if ( opaque->btpo_flags & BTP_CHAIN ) - return (true); - if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId(page, P_FIRSTKEY), - BTEqualStrategyNumber) ) - return (true); - return (false); - } - - /* damn, have to work for it. i hate that. */ - maxoff = PageGetMaxOffsetNumber(page); - - /* - * Search the entire page for the afteroid. We need to do this, rather - * than doing a binary search and starting from there, because if the - * key we're searching for is the leftmost key in the tree at this - * level, then a binary search will do the wrong thing. Splits are - * pretty infrequent, so the cost isn't as bad as it could be. - */ - - found = false; - for (offnum = P_FIRSTKEY; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) { - chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - - if ( BTItemSame (chkitem, afteritem) ) { - found = true; - break; + Page page; + ItemId hikey; + BTPageOpaque opaque; + BTItem chkitem; + OffsetNumber offnum, + maxoff; + bool found; + + page = BufferGetPage(buf); + + /* no right neighbor? */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_RIGHTMOST(opaque)) + return (true); + + /* + * this is a non-rightmost page, so it must have a high key item. + * + * If the scan key is < the high key (the min key on the next page), then + * it for sure belongs here. + */ + hikey = PageGetItemId(page, P_HIKEY); + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber)) + return (true); + + /* + * If the scan key is > the high key, then it for sure doesn't belong + * here. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber)) + return (false); + + /* + * If we have no adjacency information, and the item is equal to the + * high key on the page (by here it is), then the item does not belong + * on this page. + * + * Now it's not true in all cases. - vadim 06/10/97 + */ + + if (afteritem == (BTItem) NULL) + { + if (opaque->btpo_flags & BTP_LEAF) + return (false); + if (opaque->btpo_flags & BTP_CHAIN) + return (true); + if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, P_FIRSTKEY), + BTEqualStrategyNumber)) + return (true); + return (false); + } + + /* damn, have to work for it. i hate that. */ + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Search the entire page for the afteroid. We need to do this, + * rather than doing a binary search and starting from there, because + * if the key we're searching for is the leftmost key in the tree at + * this level, then a binary search will do the wrong thing. Splits + * are pretty infrequent, so the cost isn't as bad as it could be. + */ + + found = false; + for (offnum = P_FIRSTKEY; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + + if (BTItemSame(chkitem, afteritem)) + { + found = true; + break; + } } - } - - return (found); + + return (found); } /* - * _bt_itemcmp() -- compare item1 to item2 using a requested - * strategy (<, <=, =, >=, >) + * _bt_itemcmp() -- compare item1 to item2 using a requested + * strategy (<, <=, =, >=, >) * */ bool _bt_itemcmp(Relation rel, - Size keysz, - BTItem item1, - BTItem item2, - StrategyNumber strat) + Size keysz, + BTItem item1, + BTItem item2, + StrategyNumber strat) { - TupleDesc tupDes; - IndexTuple indexTuple1, indexTuple2; - Datum attrDatum1, attrDatum2; - int i; - bool isFirstNull, isSecondNull; - bool compare; - bool useEqual = false; - - if ( strat == BTLessEqualStrategyNumber ) - { - useEqual = true; - strat = BTLessStrategyNumber; - } - else if ( strat == BTGreaterEqualStrategyNumber ) - { - useEqual = true; - strat = BTGreaterStrategyNumber; - } - - tupDes = RelationGetTupleDescriptor(rel); - indexTuple1 = &(item1->bti_itup); - indexTuple2 = &(item2->bti_itup); - - for (i = 1; i <= keysz; i++) { - attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isFirstNull); - attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isSecondNull); - - /* see comments about NULLs handling in btbuild */ - if ( isFirstNull ) /* attr in item1 is NULL */ + TupleDesc tupDes; + IndexTuple indexTuple1, + indexTuple2; + Datum attrDatum1, + attrDatum2; + int i; + bool isFirstNull, + isSecondNull; + bool compare; + bool useEqual = false; + + if (strat == BTLessEqualStrategyNumber) { - if ( isSecondNull ) /* attr in item2 is NULL too */ - compare = ( strat == BTEqualStrategyNumber ) ? true : false; - else - compare = ( strat == BTGreaterStrategyNumber ) ? true : false; - } - else if ( isSecondNull ) /* attr in item1 is NOT_NULL and */ - { /* and attr in item2 is NULL */ - compare = ( strat == BTLessStrategyNumber ) ? true : false; - } - else - { - compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2); + useEqual = true; + strat = BTLessStrategyNumber; } - - if ( compare ) /* true for one of ">, <, =" */ + else if (strat == BTGreaterEqualStrategyNumber) { - if ( strat != BTEqualStrategyNumber ) - return (true); + useEqual = true; + strat = BTGreaterStrategyNumber; } - else /* false for one of ">, <, =" */ + + tupDes = RelationGetTupleDescriptor(rel); + indexTuple1 = &(item1->bti_itup); + indexTuple2 = &(item2->bti_itup); + + for (i = 1; i <= keysz; i++) { - if ( strat == BTEqualStrategyNumber ) - return (false); - /* - * if original strat was "<=, >=" OR - * "<, >" but some attribute(s) left - * - need to test for Equality - */ - if ( useEqual || i < keysz ) - { - if ( isFirstNull || isSecondNull ) - compare = ( isFirstNull && isSecondNull ) ? true : false; - else - compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber, - attrDatum1, attrDatum2); - if ( compare ) /* item1' and item2' attributes are equal */ - continue; /* - try to compare next attributes */ - } - return (false); + attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isFirstNull); + attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isSecondNull); + + /* see comments about NULLs handling in btbuild */ + if (isFirstNull) /* attr in item1 is NULL */ + { + if (isSecondNull) /* attr in item2 is NULL too */ + compare = (strat == BTEqualStrategyNumber) ? true : false; + else + compare = (strat == BTGreaterStrategyNumber) ? true : false; + } + else if (isSecondNull) /* attr in item1 is NOT_NULL and */ + { /* and attr in item2 is NULL */ + compare = (strat == BTLessStrategyNumber) ? true : false; + } + else + { + compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2); + } + + if (compare) /* true for one of ">, <, =" */ + { + if (strat != BTEqualStrategyNumber) + return (true); + } + else +/* false for one of ">, <, =" */ + { + if (strat == BTEqualStrategyNumber) + return (false); + + /* + * if original strat was "<=, >=" OR "<, >" but some + * attribute(s) left - need to test for Equality + */ + if (useEqual || i < keysz) + { + if (isFirstNull || isSecondNull) + compare = (isFirstNull && isSecondNull) ? true : false; + else + compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber, + attrDatum1, attrDatum2); + if (compare) /* item1' and item2' attributes are equal */ + continue; /* - try to compare next attributes */ + } + return (false); + } } - } - return (true); + return (true); } /* - * _bt_updateitem() -- updates the key of the item identified by the - * oid with the key of newItem (done in place if - * possible) + * _bt_updateitem() -- updates the key of the item identified by the + * oid with the key of newItem (done in place if + * possible) * */ static void _bt_updateitem(Relation rel, - Size keysz, - Buffer buf, - BTItem oldItem, - BTItem newItem) + Size keysz, + Buffer buf, + BTItem oldItem, + BTItem newItem) { - Page page; - OffsetNumber maxoff; - OffsetNumber i; - ItemPointerData itemPtrData; - BTItem item; - IndexTuple oldIndexTuple, newIndexTuple; - int first; - - page = BufferGetPage(buf); - maxoff = PageGetMaxOffsetNumber(page); - - /* locate item on the page */ - first = P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)) - ? P_HIKEY : P_FIRSTKEY; - i = first; - do { - item = (BTItem) PageGetItem(page, PageGetItemId(page, i)); - i = OffsetNumberNext(i); - } while (i <= maxoff && ! BTItemSame (item, oldItem)); - - /* this should never happen (in theory) */ - if ( ! BTItemSame (item, oldItem) ) { - elog(FATAL, "_bt_getstackbuf was lying!!"); - } - - /* - * It's defined by caller (_bt_insertonpg) - */ - /* - if(IndexTupleDSize(newItem->bti_itup) > - IndexTupleDSize(item->bti_itup)) { - elog(NOTICE, "trying to overwrite a smaller value with a bigger one in _bt_updateitem"); - elog(WARN, "this is not good."); - } - */ - - oldIndexTuple = &(item->bti_itup); - newIndexTuple = &(newItem->bti_itup); + Page page; + OffsetNumber maxoff; + OffsetNumber i; + ItemPointerData itemPtrData; + BTItem item; + IndexTuple oldIndexTuple, + newIndexTuple; + int first; + + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + + /* locate item on the page */ + first = P_RIGHTMOST((BTPageOpaque) PageGetSpecialPointer(page)) + ? P_HIKEY : P_FIRSTKEY; + i = first; + do + { + item = (BTItem) PageGetItem(page, PageGetItemId(page, i)); + i = OffsetNumberNext(i); + } while (i <= maxoff && !BTItemSame(item, oldItem)); + + /* this should never happen (in theory) */ + if (!BTItemSame(item, oldItem)) + { + elog(FATAL, "_bt_getstackbuf was lying!!"); + } + + /* + * It's defined by caller (_bt_insertonpg) + */ + + /* + * if(IndexTupleDSize(newItem->bti_itup) > + * IndexTupleDSize(item->bti_itup)) { elog(NOTICE, "trying to + * overwrite a smaller value with a bigger one in _bt_updateitem"); + * elog(WARN, "this is not good."); } + */ + + oldIndexTuple = &(item->bti_itup); + newIndexTuple = &(newItem->bti_itup); /* keep the original item pointer */ - ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData); - CopyIndexTuple(newIndexTuple, &oldIndexTuple); - ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid)); - + ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData); + CopyIndexTuple(newIndexTuple, &oldIndexTuple); + ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid)); + } /* @@ -1409,177 +1460,179 @@ _bt_updateitem(Relation rel, * * Rule is simple: NOT_NULL not equal NULL, NULL not_equal NULL too. */ -static bool -_bt_isequal (TupleDesc itupdesc, Page page, OffsetNumber offnum, - int keysz, ScanKey scankey) +static bool +_bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, + int keysz, ScanKey scankey) { - Datum datum; - BTItem btitem; - IndexTuple itup; - ScanKey entry; - AttrNumber attno; - long result; - int i; - bool null; - - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &(btitem->bti_itup); - - for (i = 1; i <= keysz; i++) - { - entry = &scankey[i - 1]; - attno = entry->sk_attno; - Assert (attno == i); - datum = index_getattr(itup, attno, itupdesc, &null); - - /* NULLs are not equal */ - if ( entry->sk_flags & SK_ISNULL || null ) - return (false); - - result = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, - entry->sk_argument, datum); - if (result != 0) - return (false); - } - - /* by here, the keys are equal */ - return (true); + Datum datum; + BTItem btitem; + IndexTuple itup; + ScanKey entry; + AttrNumber attno; + long result; + int i; + bool null; + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + for (i = 1; i <= keysz; i++) + { + entry = &scankey[i - 1]; + attno = entry->sk_attno; + Assert(attno == i); + datum = index_getattr(itup, attno, itupdesc, &null); + + /* NULLs are not equal */ + if (entry->sk_flags & SK_ISNULL || null) + return (false); + + result = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + entry->sk_argument, datum); + if (result != 0) + return (false); + } + + /* by here, the keys are equal */ + return (true); } #ifdef NOT_USED /* - * _bt_shift - insert btitem on the passed page after shifting page - * to the right in the tree. + * _bt_shift - insert btitem on the passed page after shifting page + * to the right in the tree. * * NOTE: tested for shifting leftmost page only, having btitem < hikey. */ -static InsertIndexResult -_bt_shift (Relation rel, Buffer buf, BTStack stack, int keysz, - ScanKey scankey, BTItem btitem, BTItem hikey) +static InsertIndexResult +_bt_shift(Relation rel, Buffer buf, BTStack stack, int keysz, + ScanKey scankey, BTItem btitem, BTItem hikey) { - InsertIndexResult res; - int itemsz; - Page page; - BlockNumber bknum; - BTPageOpaque pageop; - Buffer rbuf; - Page rpage; - BTPageOpaque rpageop; - Buffer pbuf; - Page ppage; - BTPageOpaque ppageop; - Buffer nbuf; - Page npage; - BTPageOpaque npageop; - BlockNumber nbknum; - BTItem nitem; - OffsetNumber afteroff; - - btitem = _bt_formitem(&(btitem->bti_itup)); - hikey = _bt_formitem(&(hikey->bti_itup)); - - page = BufferGetPage(buf); - - /* grab new page */ - nbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - nbknum = BufferGetBlockNumber(nbuf); - npage = BufferGetPage(nbuf); - _bt_pageinit(npage, BufferGetPageSize(nbuf)); - npageop = (BTPageOpaque) PageGetSpecialPointer(npage); - - /* copy content of the passed page */ - memmove ((char *) npage, (char *) page, BufferGetPageSize(buf)); - - /* re-init old (passed) page */ - _bt_pageinit(page, BufferGetPageSize(buf)); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - - /* init old page opaque */ - pageop->btpo_flags = npageop->btpo_flags; /* restore flags */ - pageop->btpo_flags &= ~BTP_CHAIN; - if ( _bt_itemcmp (rel, keysz, hikey, btitem, BTEqualStrategyNumber) ) - pageop->btpo_flags |= BTP_CHAIN; - pageop->btpo_prev = npageop->btpo_prev; /* restore prev */ - pageop->btpo_next = nbknum; /* next points to the new page */ - - /* init shifted page opaque */ - npageop->btpo_prev = bknum = BufferGetBlockNumber(buf); - - /* shifted page is ok, populate old page */ - - /* add passed hikey */ - itemsz = IndexTupleDSize(hikey->bti_itup) - + (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = DOUBLEALIGN(itemsz); - if ( PageAddItem(page, (Item) hikey, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add hikey in _bt_shift"); - pfree (hikey); - - /* add btitem */ - itemsz = IndexTupleDSize(btitem->bti_itup) - + (sizeof(BTItemData) - sizeof(IndexTupleData)); - itemsz = DOUBLEALIGN(itemsz); - if ( PageAddItem(page, (Item) btitem, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add firstkey in _bt_shift"); - pfree (btitem); - nitem = (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)); - btitem = _bt_formitem(&(nitem->bti_itup)); - ItemPointerSet(&(btitem->bti_itup.t_tid), bknum, P_HIKEY); - - /* ok, write them out */ - _bt_wrtnorelbuf(rel, nbuf); - _bt_wrtnorelbuf(rel, buf); - - /* fix btpo_prev on right sibling of old page */ - if ( !P_RIGHTMOST (npageop) ) - { - rbuf = _bt_getbuf(rel, npageop->btpo_next, BT_WRITE); - rpage = BufferGetPage(rbuf); - rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); - rpageop->btpo_prev = nbknum; - _bt_wrtbuf(rel, rbuf); - } - - /* get parent pointing to the old page */ - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), - bknum, P_HIKEY); - pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); - ppage = BufferGetPage(pbuf); - ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); - - _bt_relbuf(rel, nbuf, BT_WRITE); - _bt_relbuf(rel, buf, BT_WRITE); - - /* re-set parent' pointer - we shifted our page to the right ! */ - nitem = (BTItem) PageGetItem (ppage, - PageGetItemId (ppage, stack->bts_offset)); - ItemPointerSet(&(nitem->bti_itup.t_tid), nbknum, P_HIKEY); - ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), nbknum, P_HIKEY); - _bt_wrtnorelbuf(rel, pbuf); - - /* - * Now we want insert into the parent pointer to our old page. It has to - * be inserted before the pointer to new page. You may get problems here - * (in the _bt_goesonpg and/or _bt_pgaddtup), but may be not - I don't - * know. It works if old page is leftmost (nitem is NULL) and - * btitem < hikey and it's all what we need currently. - vadim 05/30/97 - */ - nitem = NULL; - afteroff = P_FIRSTKEY; - if ( !P_RIGHTMOST (ppageop) ) - afteroff = OffsetNumberNext (afteroff); - if ( stack->bts_offset >= afteroff ) - { - afteroff = OffsetNumberPrev (stack->bts_offset); - nitem = (BTItem) PageGetItem (ppage, PageGetItemId (ppage, afteroff)); - nitem = _bt_formitem(&(nitem->bti_itup)); - } - res = _bt_insertonpg(rel, pbuf, stack->bts_parent, - keysz, scankey, btitem, nitem); - pfree (btitem); - - ItemPointerSet(&(res->pointerData), nbknum, P_HIKEY); - - return (res); + InsertIndexResult res; + int itemsz; + Page page; + BlockNumber bknum; + BTPageOpaque pageop; + Buffer rbuf; + Page rpage; + BTPageOpaque rpageop; + Buffer pbuf; + Page ppage; + BTPageOpaque ppageop; + Buffer nbuf; + Page npage; + BTPageOpaque npageop; + BlockNumber nbknum; + BTItem nitem; + OffsetNumber afteroff; + + btitem = _bt_formitem(&(btitem->bti_itup)); + hikey = _bt_formitem(&(hikey->bti_itup)); + + page = BufferGetPage(buf); + + /* grab new page */ + nbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + nbknum = BufferGetBlockNumber(nbuf); + npage = BufferGetPage(nbuf); + _bt_pageinit(npage, BufferGetPageSize(nbuf)); + npageop = (BTPageOpaque) PageGetSpecialPointer(npage); + + /* copy content of the passed page */ + memmove((char *) npage, (char *) page, BufferGetPageSize(buf)); + + /* re-init old (passed) page */ + _bt_pageinit(page, BufferGetPageSize(buf)); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + + /* init old page opaque */ + pageop->btpo_flags = npageop->btpo_flags; /* restore flags */ + pageop->btpo_flags &= ~BTP_CHAIN; + if (_bt_itemcmp(rel, keysz, hikey, btitem, BTEqualStrategyNumber)) + pageop->btpo_flags |= BTP_CHAIN; + pageop->btpo_prev = npageop->btpo_prev; /* restore prev */ + pageop->btpo_next = nbknum; /* next points to the new page */ + + /* init shifted page opaque */ + npageop->btpo_prev = bknum = BufferGetBlockNumber(buf); + + /* shifted page is ok, populate old page */ + + /* add passed hikey */ + itemsz = IndexTupleDSize(hikey->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + itemsz = DOUBLEALIGN(itemsz); + if (PageAddItem(page, (Item) hikey, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add hikey in _bt_shift"); + pfree(hikey); + + /* add btitem */ + itemsz = IndexTupleDSize(btitem->bti_itup) + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + itemsz = DOUBLEALIGN(itemsz); + if (PageAddItem(page, (Item) btitem, itemsz, P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add firstkey in _bt_shift"); + pfree(btitem); + nitem = (BTItem) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY)); + btitem = _bt_formitem(&(nitem->bti_itup)); + ItemPointerSet(&(btitem->bti_itup.t_tid), bknum, P_HIKEY); + + /* ok, write them out */ + _bt_wrtnorelbuf(rel, nbuf); + _bt_wrtnorelbuf(rel, buf); + + /* fix btpo_prev on right sibling of old page */ + if (!P_RIGHTMOST(npageop)) + { + rbuf = _bt_getbuf(rel, npageop->btpo_next, BT_WRITE); + rpage = BufferGetPage(rbuf); + rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage); + rpageop->btpo_prev = nbknum; + _bt_wrtbuf(rel, rbuf); + } + + /* get parent pointing to the old page */ + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), + bknum, P_HIKEY); + pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); + ppage = BufferGetPage(pbuf); + ppageop = (BTPageOpaque) PageGetSpecialPointer(ppage); + + _bt_relbuf(rel, nbuf, BT_WRITE); + _bt_relbuf(rel, buf, BT_WRITE); + + /* re-set parent' pointer - we shifted our page to the right ! */ + nitem = (BTItem) PageGetItem(ppage, + PageGetItemId(ppage, stack->bts_offset)); + ItemPointerSet(&(nitem->bti_itup.t_tid), nbknum, P_HIKEY); + ItemPointerSet(&(stack->bts_btitem->bti_itup.t_tid), nbknum, P_HIKEY); + _bt_wrtnorelbuf(rel, pbuf); + + /* + * Now we want insert into the parent pointer to our old page. It has + * to be inserted before the pointer to new page. You may get problems + * here (in the _bt_goesonpg and/or _bt_pgaddtup), but may be not - I + * don't know. It works if old page is leftmost (nitem is NULL) and + * btitem < hikey and it's all what we need currently. - vadim + * 05/30/97 + */ + nitem = NULL; + afteroff = P_FIRSTKEY; + if (!P_RIGHTMOST(ppageop)) + afteroff = OffsetNumberNext(afteroff); + if (stack->bts_offset >= afteroff) + { + afteroff = OffsetNumberPrev(stack->bts_offset); + nitem = (BTItem) PageGetItem(ppage, PageGetItemId(ppage, afteroff)); + nitem = _bt_formitem(&(nitem->bti_itup)); + } + res = _bt_insertonpg(rel, pbuf, stack->bts_parent, + keysz, scankey, btitem, nitem); + pfree(btitem); + + ItemPointerSet(&(res->pointerData), nbknum, P_HIKEY); + + return (res); } + #endif diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 9142c557378..6551af4c17c 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1,21 +1,21 @@ /*------------------------------------------------------------------------- * * nbtpage.c-- - * BTree-specific page management code for the Postgres btree access - * method. + * BTree-specific page management code for the Postgres btree access + * method. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.9 1997/08/19 21:29:36 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.10 1997/09/07 04:38:52 momjian Exp $ * - * NOTES - * Postgres btree pages look like ordinary relation pages. The opaque - * data at high addresses includes pointers to left and right siblings - * and flag data describing page state. The first page in a btree, page - * zero, is special -- it stores meta-information describing the tree. - * Pages one and higher store the actual tree data. + * NOTES + * Postgres btree pages look like ordinary relation pages. The opaque + * data at high addresses includes pointers to left and right siblings + * and flag data describing page state. The first page in a btree, page + * zero, is special -- it stores meta-information describing the tree. + * Pages one and higher store the actual tree data. * *------------------------------------------------------------------------- */ @@ -31,16 +31,16 @@ #include <storage/lmgr.h> #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif -static void _bt_setpagelock(Relation rel, BlockNumber blkno, int access); -static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access); +static void _bt_setpagelock(Relation rel, BlockNumber blkno, int access); +static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access); #define BTREE_METAPAGE 0 -#define BTREE_MAGIC 0x053162 +#define BTREE_MAGIC 0x053162 #ifdef BTREE_VERSION_1 #define BTREE_VERSION 1 @@ -48,546 +48,574 @@ static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access); #define BTREE_VERSION 0 #endif -typedef struct BTMetaPageData { - uint32 btm_magic; - uint32 btm_version; - BlockNumber btm_root; +typedef struct BTMetaPageData +{ + uint32 btm_magic; + uint32 btm_version; + BlockNumber btm_root; #ifdef BTREE_VERSION_1 - int32 btm_level; + int32 btm_level; #endif -} BTMetaPageData; +} BTMetaPageData; -#define BTPageGetMeta(p) \ - ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0]) -extern bool BuildingBtree; +extern bool BuildingBtree; /* - * We use high-concurrency locking on btrees. There are two cases in - * which we don't do locking. One is when we're building the btree. - * Since the creating transaction has not committed, no one can see - * the index, and there's no reason to share locks. The second case - * is when we're just starting up the database system. We use some - * special-purpose initialization code in the relation cache manager - * (see utils/cache/relcache.c) to allow us to do indexed scans on - * the system catalogs before we'd normally be able to. This happens - * before the lock table is fully initialized, so we can't use it. - * Strictly speaking, this violates 2pl, but we don't do 2pl on the - * system catalogs anyway, so I declare this to be okay. + * We use high-concurrency locking on btrees. There are two cases in + * which we don't do locking. One is when we're building the btree. + * Since the creating transaction has not committed, no one can see + * the index, and there's no reason to share locks. The second case + * is when we're just starting up the database system. We use some + * special-purpose initialization code in the relation cache manager + * (see utils/cache/relcache.c) to allow us to do indexed scans on + * the system catalogs before we'd normally be able to. This happens + * before the lock table is fully initialized, so we can't use it. + * Strictly speaking, this violates 2pl, but we don't do 2pl on the + * system catalogs anyway, so I declare this to be okay. */ -#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) +#define USELOCKING (!BuildingBtree && !IsInitProcessingMode()) /* - * _bt_metapinit() -- Initialize the metadata page of a btree. + * _bt_metapinit() -- Initialize the metadata page of a btree. */ void _bt_metapinit(Relation rel) { - Buffer buf; - Page pg; - int nblocks; - BTMetaPageData metad; - BTPageOpaque op; - - /* can't be sharing this with anyone, now... */ - if (USELOCKING) - RelationSetLockForWrite(rel); - - if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) { - elog(WARN, "Cannot initialize non-empty btree %s", - RelationGetRelationName(rel)); - } - - buf = ReadBuffer(rel, P_NEW); - pg = BufferGetPage(buf); - _bt_pageinit(pg, BufferGetPageSize(buf)); - - metad.btm_magic = BTREE_MAGIC; - metad.btm_version = BTREE_VERSION; - metad.btm_root = P_NONE; + Buffer buf; + Page pg; + int nblocks; + BTMetaPageData metad; + BTPageOpaque op; + + /* can't be sharing this with anyone, now... */ + if (USELOCKING) + RelationSetLockForWrite(rel); + + if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) + { + elog(WARN, "Cannot initialize non-empty btree %s", + RelationGetRelationName(rel)); + } + + buf = ReadBuffer(rel, P_NEW); + pg = BufferGetPage(buf); + _bt_pageinit(pg, BufferGetPageSize(buf)); + + metad.btm_magic = BTREE_MAGIC; + metad.btm_version = BTREE_VERSION; + metad.btm_root = P_NONE; #ifdef BTREE_VERSION_1 - metad.btm_level = 0; + metad.btm_level = 0; #endif - memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); - - op = (BTPageOpaque) PageGetSpecialPointer(pg); - op->btpo_flags = BTP_META; - - WriteBuffer(buf); - - /* all done */ - if (USELOCKING) - RelationUnsetLockForWrite(rel); + memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad)); + + op = (BTPageOpaque) PageGetSpecialPointer(pg); + op->btpo_flags = BTP_META; + + WriteBuffer(buf); + + /* all done */ + if (USELOCKING) + RelationUnsetLockForWrite(rel); } #ifdef NOT_USED /* - * _bt_checkmeta() -- Verify that the metadata stored in a btree are - * reasonable. + * _bt_checkmeta() -- Verify that the metadata stored in a btree are + * reasonable. */ void _bt_checkmeta(Relation rel) { - Buffer metabuf; - Page metap; - BTMetaPageData *metad; - BTPageOpaque op; - int nblocks; - - /* if the relation is empty, this is init time; don't complain */ - if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) - return; - - metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); - metap = BufferGetPage(metabuf); - op = (BTPageOpaque) PageGetSpecialPointer(metap); - if (!(op->btpo_flags & BTP_META)) { - elog(WARN, "Invalid metapage for index %s", - RelationGetRelationName(rel)); - } - metad = BTPageGetMeta(metap); - - if (metad->btm_magic != BTREE_MAGIC) { - elog(WARN, "Index %s is not a btree", - RelationGetRelationName(rel)); - } - - if (metad->btm_version != BTREE_VERSION) { - elog(WARN, "Version mismatch on %s: version %d file, version %d code", - RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION); - } - - _bt_relbuf(rel, metabuf, BT_READ); + Buffer metabuf; + Page metap; + BTMetaPageData *metad; + BTPageOpaque op; + int nblocks; + + /* if the relation is empty, this is init time; don't complain */ + if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0) + return; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metap = BufferGetPage(metabuf); + op = (BTPageOpaque) PageGetSpecialPointer(metap); + if (!(op->btpo_flags & BTP_META)) + { + elog(WARN, "Invalid metapage for index %s", + RelationGetRelationName(rel)); + } + metad = BTPageGetMeta(metap); + + if (metad->btm_magic != BTREE_MAGIC) + { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) + { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + _bt_relbuf(rel, metabuf, BT_READ); } + #endif /* - * _bt_getroot() -- Get the root page of the btree. + * _bt_getroot() -- Get the root page of the btree. * - * Since the root page can move around the btree file, we have to read - * its location from the metadata page, and then read the root page - * itself. If no root page exists yet, we have to create one. The - * standard class of race conditions exists here; I think I covered - * them all in the Hopi Indian rain dance of lock requests below. + * Since the root page can move around the btree file, we have to read + * its location from the metadata page, and then read the root page + * itself. If no root page exists yet, we have to create one. The + * standard class of race conditions exists here; I think I covered + * them all in the Hopi Indian rain dance of lock requests below. * - * We pass in the access type (BT_READ or BT_WRITE), and return the - * root page's buffer with the appropriate lock type set. Reference - * count on the root page gets bumped by ReadBuffer. The metadata - * page is unlocked and unreferenced by this process when this routine - * returns. + * We pass in the access type (BT_READ or BT_WRITE), and return the + * root page's buffer with the appropriate lock type set. Reference + * count on the root page gets bumped by ReadBuffer. The metadata + * page is unlocked and unreferenced by this process when this routine + * returns. */ Buffer _bt_getroot(Relation rel, int access) { - Buffer metabuf; - Page metapg; - BTPageOpaque metaopaque; - Buffer rootbuf; - Page rootpg; - BTPageOpaque rootopaque; - BlockNumber rootblkno; - BTMetaPageData *metad; - - metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); - metapg = BufferGetPage(metabuf); - metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); - Assert(metaopaque->btpo_flags & BTP_META); - metad = BTPageGetMeta(metapg); - - if (metad->btm_magic != BTREE_MAGIC) { - elog(WARN, "Index %s is not a btree", - RelationGetRelationName(rel)); - } - - if (metad->btm_version != BTREE_VERSION) { - elog(WARN, "Version mismatch on %s: version %d file, version %d code", - RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION); - } - - /* if no root page initialized yet, do it */ - if (metad->btm_root == P_NONE) { - - /* turn our read lock in for a write lock */ - _bt_relbuf(rel, metabuf, BT_READ); - metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + Buffer rootbuf; + Page rootpg; + BTPageOpaque rootopaque; + BlockNumber rootblkno; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); metapg = BufferGetPage(metabuf); metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); Assert(metaopaque->btpo_flags & BTP_META); metad = BTPageGetMeta(metapg); - - /* - * Race condition: if someone else initialized the metadata between - * the time we released the read lock and acquired the write lock, - * above, we want to avoid doing it again. - */ - - if (metad->btm_root == P_NONE) { - - /* - * Get, initialize, write, and leave a lock of the appropriate - * type on the new root page. Since this is the first page in - * the tree, it's a leaf. - */ - - rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - rootblkno = BufferGetBlockNumber(rootbuf); - rootpg = BufferGetPage(rootbuf); - metad->btm_root = rootblkno; + + if (metad->btm_magic != BTREE_MAGIC) + { + elog(WARN, "Index %s is not a btree", + RelationGetRelationName(rel)); + } + + if (metad->btm_version != BTREE_VERSION) + { + elog(WARN, "Version mismatch on %s: version %d file, version %d code", + RelationGetRelationName(rel), + metad->btm_version, BTREE_VERSION); + } + + /* if no root page initialized yet, do it */ + if (metad->btm_root == P_NONE) + { + + /* turn our read lock in for a write lock */ + _bt_relbuf(rel, metabuf, BT_READ); + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metapg); + + /* + * Race condition: if someone else initialized the metadata + * between the time we released the read lock and acquired the + * write lock, above, we want to avoid doing it again. + */ + + if (metad->btm_root == P_NONE) + { + + /* + * Get, initialize, write, and leave a lock of the appropriate + * type on the new root page. Since this is the first page in + * the tree, it's a leaf. + */ + + rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rootblkno = BufferGetBlockNumber(rootbuf); + rootpg = BufferGetPage(rootbuf); + metad->btm_root = rootblkno; #ifdef BTREE_VERSION_1 - metad->btm_level = 1; + metad->btm_level = 1; #endif - _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); - rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); - _bt_wrtnorelbuf(rel, rootbuf); - - /* swap write lock for read lock, if appropriate */ - if (access != BT_WRITE) { - _bt_setpagelock(rel, rootblkno, BT_READ); - _bt_unsetpagelock(rel, rootblkno, BT_WRITE); - } - - /* okay, metadata is correct */ - _bt_wrtbuf(rel, metabuf); - } else { - - /* - * Metadata initialized by someone else. In order to guarantee - * no deadlocks, we have to release the metadata page and start - * all over again. - */ - - _bt_relbuf(rel, metabuf, BT_WRITE); - return (_bt_getroot(rel, access)); + _bt_pageinit(rootpg, BufferGetPageSize(rootbuf)); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT); + _bt_wrtnorelbuf(rel, rootbuf); + + /* swap write lock for read lock, if appropriate */ + if (access != BT_WRITE) + { + _bt_setpagelock(rel, rootblkno, BT_READ); + _bt_unsetpagelock(rel, rootblkno, BT_WRITE); + } + + /* okay, metadata is correct */ + _bt_wrtbuf(rel, metabuf); + } + else + { + + /* + * Metadata initialized by someone else. In order to + * guarantee no deadlocks, we have to release the metadata + * page and start all over again. + */ + + _bt_relbuf(rel, metabuf, BT_WRITE); + return (_bt_getroot(rel, access)); + } } - } else { - rootbuf = _bt_getbuf(rel, metad->btm_root, access); - - /* done with the meta page */ - _bt_relbuf(rel, metabuf, BT_READ); - } - - /* - * Race condition: If the root page split between the time we looked - * at the metadata page and got the root buffer, then we got the wrong - * buffer. - */ - - rootpg = BufferGetPage(rootbuf); - rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); - if (!(rootopaque->btpo_flags & BTP_ROOT)) { - - /* it happened, try again */ - _bt_relbuf(rel, rootbuf, access); - return (_bt_getroot(rel, access)); - } - - /* - * By here, we have a correct lock on the root block, its reference - * count is correct, and we have no lock set on the metadata page. - * Return the root block. - */ - - return (rootbuf); + else + { + rootbuf = _bt_getbuf(rel, metad->btm_root, access); + + /* done with the meta page */ + _bt_relbuf(rel, metabuf, BT_READ); + } + + /* + * Race condition: If the root page split between the time we looked + * at the metadata page and got the root buffer, then we got the wrong + * buffer. + */ + + rootpg = BufferGetPage(rootbuf); + rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg); + if (!(rootopaque->btpo_flags & BTP_ROOT)) + { + + /* it happened, try again */ + _bt_relbuf(rel, rootbuf, access); + return (_bt_getroot(rel, access)); + } + + /* + * By here, we have a correct lock on the root block, its reference + * count is correct, and we have no lock set on the metadata page. + * Return the root block. + */ + + return (rootbuf); } /* - * _bt_getbuf() -- Get a buffer by block number for read or write. + * _bt_getbuf() -- Get a buffer by block number for read or write. * - * When this routine returns, the appropriate lock is set on the - * requested buffer its reference count is correct. + * When this routine returns, the appropriate lock is set on the + * requested buffer its reference count is correct. */ Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access) { - Buffer buf; - Page page; - - /* - * If we want a new block, we can't set a lock of the appropriate type - * until we've instantiated the buffer. - */ - - if (blkno != P_NEW) { - if (access == BT_WRITE) - _bt_setpagelock(rel, blkno, BT_WRITE); - else - _bt_setpagelock(rel, blkno, BT_READ); - - buf = ReadBuffer(rel, blkno); - } else { - buf = ReadBuffer(rel, blkno); - blkno = BufferGetBlockNumber(buf); - page = BufferGetPage(buf); - _bt_pageinit(page, BufferGetPageSize(buf)); - - if (access == BT_WRITE) - _bt_setpagelock(rel, blkno, BT_WRITE); + Buffer buf; + Page page; + + /* + * If we want a new block, we can't set a lock of the appropriate type + * until we've instantiated the buffer. + */ + + if (blkno != P_NEW) + { + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + + buf = ReadBuffer(rel, blkno); + } else - _bt_setpagelock(rel, blkno, BT_READ); - } - - /* ref count and lock type are correct */ - return (buf); + { + buf = ReadBuffer(rel, blkno); + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); + _bt_pageinit(page, BufferGetPageSize(buf)); + + if (access == BT_WRITE) + _bt_setpagelock(rel, blkno, BT_WRITE); + else + _bt_setpagelock(rel, blkno, BT_READ); + } + + /* ref count and lock type are correct */ + return (buf); } /* - * _bt_relbuf() -- release a locked buffer. + * _bt_relbuf() -- release a locked buffer. */ void _bt_relbuf(Relation rel, Buffer buf, int access) { - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - - /* access had better be one of read or write */ - if (access == BT_WRITE) - _bt_unsetpagelock(rel, blkno, BT_WRITE); - else - _bt_unsetpagelock(rel, blkno, BT_READ); - - ReleaseBuffer(buf); + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + + /* access had better be one of read or write */ + if (access == BT_WRITE) + _bt_unsetpagelock(rel, blkno, BT_WRITE); + else + _bt_unsetpagelock(rel, blkno, BT_READ); + + ReleaseBuffer(buf); } /* - * _bt_wrtbuf() -- write a btree page to disk. + * _bt_wrtbuf() -- write a btree page to disk. * - * This routine releases the lock held on the buffer and our reference - * to it. It is an error to call _bt_wrtbuf() without a write lock - * or a reference to the buffer. + * This routine releases the lock held on the buffer and our reference + * to it. It is an error to call _bt_wrtbuf() without a write lock + * or a reference to the buffer. */ void _bt_wrtbuf(Relation rel, Buffer buf) { - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteBuffer(buf); - _bt_unsetpagelock(rel, blkno, BT_WRITE); + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteBuffer(buf); + _bt_unsetpagelock(rel, blkno, BT_WRITE); } /* - * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release - * our reference or lock. + * _bt_wrtnorelbuf() -- write a btree page to disk, but do not release + * our reference or lock. * - * It is an error to call _bt_wrtnorelbuf() without a write lock - * or a reference to the buffer. + * It is an error to call _bt_wrtnorelbuf() without a write lock + * or a reference to the buffer. */ void _bt_wrtnorelbuf(Relation rel, Buffer buf) { - BlockNumber blkno; - - blkno = BufferGetBlockNumber(buf); - WriteNoReleaseBuffer(buf); + BlockNumber blkno; + + blkno = BufferGetBlockNumber(buf); + WriteNoReleaseBuffer(buf); } /* - * _bt_pageinit() -- Initialize a new page. + * _bt_pageinit() -- Initialize a new page. */ void _bt_pageinit(Page page, Size size) { - /* - * Cargo-cult programming -- don't really need this to be zero, but - * creating new pages is an infrequent occurrence and it makes me feel - * good when I know they're empty. - */ - - memset(page, 0, size); - - PageInit(page, size, sizeof(BTPageOpaqueData)); + + /* + * Cargo-cult programming -- don't really need this to be zero, but + * creating new pages is an infrequent occurrence and it makes me feel + * good when I know they're empty. + */ + + memset(page, 0, size); + + PageInit(page, size, sizeof(BTPageOpaqueData)); } /* - * _bt_metaproot() -- Change the root page of the btree. + * _bt_metaproot() -- Change the root page of the btree. * - * Lehman and Yao require that the root page move around in order to - * guarantee deadlock-free short-term, fine-granularity locking. When - * we split the root page, we record the new parent in the metadata page - * for the relation. This routine does the work. + * Lehman and Yao require that the root page move around in order to + * guarantee deadlock-free short-term, fine-granularity locking. When + * we split the root page, we record the new parent in the metadata page + * for the relation. This routine does the work. * - * No direct preconditions, but if you don't have the a write lock on - * at least the old root page when you call this, you're making a big - * mistake. On exit, metapage data is correct and we no longer have - * a reference to or lock on the metapage. + * No direct preconditions, but if you don't have the a write lock on + * at least the old root page when you call this, you're making a big + * mistake. On exit, metapage data is correct and we no longer have + * a reference to or lock on the metapage. */ void _bt_metaproot(Relation rel, BlockNumber rootbknum, int level) { - Buffer metabuf; - Page metap; - BTPageOpaque metaopaque; - BTMetaPageData *metad; - - metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); - metap = BufferGetPage(metabuf); - metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); - Assert(metaopaque->btpo_flags & BTP_META); - metad = BTPageGetMeta(metap); - metad->btm_root = rootbknum; + Buffer metabuf; + Page metap; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); + metap = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap); + Assert(metaopaque->btpo_flags & BTP_META); + metad = BTPageGetMeta(metap); + metad->btm_root = rootbknum; #ifdef BTREE_VERSION_1 - if ( level == 0 ) /* called from _do_insert */ - metad->btm_level += 1; - else - metad->btm_level = level; /* called from btsort */ + if (level == 0) /* called from _do_insert */ + metad->btm_level += 1; + else + metad->btm_level = level; /* called from btsort */ #endif - _bt_wrtbuf(rel, metabuf); + _bt_wrtbuf(rel, metabuf); } /* - * _bt_getstackbuf() -- Walk back up the tree one step, and find the item - * we last looked at in the parent. + * _bt_getstackbuf() -- Walk back up the tree one step, and find the item + * we last looked at in the parent. * - * This is possible because we save a bit image of the last item - * we looked at in the parent, and the update algorithm guarantees - * that if items above us in the tree move, they only move right. + * This is possible because we save a bit image of the last item + * we looked at in the parent, and the update algorithm guarantees + * that if items above us in the tree move, they only move right. * - * Also, re-set bts_blkno & bts_offset if changed and - * bts_btitem (it may be changed - see _bt_insertonpg). + * Also, re-set bts_blkno & bts_offset if changed and + * bts_btitem (it may be changed - see _bt_insertonpg). */ Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access) { - Buffer buf; - BlockNumber blkno; - OffsetNumber start, offnum, maxoff; - OffsetNumber i; - Page page; - ItemId itemid; - BTItem item; - BTPageOpaque opaque; - BTItem item_save; - int item_nbytes; - - blkno = stack->bts_blkno; - buf = _bt_getbuf(rel, blkno, access); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - - if (maxoff >= stack->bts_offset) { - itemid = PageGetItemId(page, stack->bts_offset); - item = (BTItem) PageGetItem(page, itemid); - - /* if the item is where we left it, we're done */ - if ( BTItemSame (item, stack->bts_btitem) ) - { - pfree(stack->bts_btitem); - item_nbytes = ItemIdGetLength(itemid); - item_save = (BTItem) palloc(item_nbytes); - memmove((char *) item_save, (char *) item, item_nbytes); - stack->bts_btitem = item_save; - return (buf); - } - - /* if the item has just moved right on this page, we're done */ - for (i = OffsetNumberNext(stack->bts_offset); - i <= maxoff; - i = OffsetNumberNext(i)) { - itemid = PageGetItemId(page, i); - item = (BTItem) PageGetItem(page, itemid); - - /* if the item is where we left it, we're done */ - if ( BTItemSame (item, stack->bts_btitem) ) - { - stack->bts_offset = i; - pfree(stack->bts_btitem); - item_nbytes = ItemIdGetLength(itemid); - item_save = (BTItem) palloc(item_nbytes); - memmove((char *) item_save, (char *) item, item_nbytes); - stack->bts_btitem = item_save; - return (buf); - } - } - } - - /* by here, the item we're looking for moved right at least one page */ - for (;;) { - blkno = opaque->btpo_next; - if (P_RIGHTMOST(opaque)) - elog(FATAL, "my bits moved right off the end of the world!"); - - _bt_relbuf(rel, buf, access); + Buffer buf; + BlockNumber blkno; + OffsetNumber start, + offnum, + maxoff; + OffsetNumber i; + Page page; + ItemId itemid; + BTItem item; + BTPageOpaque opaque; + BTItem item_save; + int item_nbytes; + + blkno = stack->bts_blkno; buf = _bt_getbuf(rel, blkno, access); page = BufferGetPage(buf); - maxoff = PageGetMaxOffsetNumber(page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* if we have a right sibling, step over the high key */ - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* see if it's on this page */ - for (offnum = start; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) { - itemid = PageGetItemId(page, offnum); - item = (BTItem) PageGetItem(page, itemid); - if ( BTItemSame (item, stack->bts_btitem) ) - { - stack->bts_offset = offnum; - stack->bts_blkno = blkno; - pfree(stack->bts_btitem); - item_nbytes = ItemIdGetLength(itemid); - item_save = (BTItem) palloc(item_nbytes); - memmove((char *) item_save, (char *) item, item_nbytes); - stack->bts_btitem = item_save; - return (buf); - } + maxoff = PageGetMaxOffsetNumber(page); + + if (maxoff >= stack->bts_offset) + { + itemid = PageGetItemId(page, stack->bts_offset); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (BTItemSame(item, stack->bts_btitem)) + { + pfree(stack->bts_btitem); + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) item, item_nbytes); + stack->bts_btitem = item_save; + return (buf); + } + + /* if the item has just moved right on this page, we're done */ + for (i = OffsetNumberNext(stack->bts_offset); + i <= maxoff; + i = OffsetNumberNext(i)) + { + itemid = PageGetItemId(page, i); + item = (BTItem) PageGetItem(page, itemid); + + /* if the item is where we left it, we're done */ + if (BTItemSame(item, stack->bts_btitem)) + { + stack->bts_offset = i; + pfree(stack->bts_btitem); + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) item, item_nbytes); + stack->bts_btitem = item_save; + return (buf); + } + } + } + + /* by here, the item we're looking for moved right at least one page */ + for (;;) + { + blkno = opaque->btpo_next; + if (P_RIGHTMOST(opaque)) + elog(FATAL, "my bits moved right off the end of the world!"); + + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, blkno, access); + page = BufferGetPage(buf); + maxoff = PageGetMaxOffsetNumber(page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we have a right sibling, step over the high key */ + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* see if it's on this page */ + for (offnum = start; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + itemid = PageGetItemId(page, offnum); + item = (BTItem) PageGetItem(page, itemid); + if (BTItemSame(item, stack->bts_btitem)) + { + stack->bts_offset = offnum; + stack->bts_blkno = blkno; + pfree(stack->bts_btitem); + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) item, item_nbytes); + stack->bts_btitem = item_save; + return (buf); + } + } } - } } static void _bt_setpagelock(Relation rel, BlockNumber blkno, int access) { - ItemPointerData iptr; - - if (USELOCKING) { - ItemPointerSet(&iptr, blkno, P_HIKEY); - - if (access == BT_WRITE) - RelationSetSingleWLockPage(rel, &iptr); - else - RelationSetSingleRLockPage(rel, &iptr); - } + ItemPointerData iptr; + + if (USELOCKING) + { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationSetSingleWLockPage(rel, &iptr); + else + RelationSetSingleRLockPage(rel, &iptr); + } } static void _bt_unsetpagelock(Relation rel, BlockNumber blkno, int access) { - ItemPointerData iptr; - - if (USELOCKING) { - ItemPointerSet(&iptr, blkno, P_HIKEY); - - if (access == BT_WRITE) - RelationUnsetSingleWLockPage(rel, &iptr); - else - RelationUnsetSingleRLockPage(rel, &iptr); - } + ItemPointerData iptr; + + if (USELOCKING) + { + ItemPointerSet(&iptr, blkno, P_HIKEY); + + if (access == BT_WRITE) + RelationUnsetSingleWLockPage(rel, &iptr); + else + RelationUnsetSingleRLockPage(rel, &iptr); + } } void _bt_pagedel(Relation rel, ItemPointer tid) { - Buffer buf; - Page page; - BlockNumber blkno; - OffsetNumber offno; - - blkno = ItemPointerGetBlockNumber(tid); - offno = ItemPointerGetOffsetNumber(tid); - - buf = _bt_getbuf(rel, blkno, BT_WRITE); - page = BufferGetPage(buf); - - PageIndexTupleDelete(page, offno); - - /* write the buffer and release the lock */ - _bt_wrtbuf(rel, buf); + Buffer buf; + Page page; + BlockNumber blkno; + OffsetNumber offno; + + blkno = ItemPointerGetBlockNumber(tid); + offno = ItemPointerGetOffsetNumber(tid); + + buf = _bt_getbuf(rel, blkno, BT_WRITE); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offno); + + /* write the buffer and release the lock */ + _bt_wrtbuf(rel, buf); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b672901f8db..dccbd77b355 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1,17 +1,17 @@ /*------------------------------------------------------------------------- * * btree.c-- - * Implementation of Lehman and Yao's btree management algorithm for - * Postgres. + * Implementation of Lehman and Yao's btree management algorithm for + * Postgres. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.19 1997/05/05 03:41:17 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.20 1997/09/07 04:38:54 momjian Exp $ * * NOTES - * This file contains only the public interface routines. + * This file contains only the public interface routines. * *------------------------------------------------------------------------- */ @@ -28,546 +28,579 @@ #include <miscadmin.h> #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif #ifdef BTREE_BUILD_STATS #include <tcop/tcopprot.h> -extern int ShowExecutorStats; +extern int ShowExecutorStats; + #endif -bool BuildingBtree = false; /* see comment in btbuild() */ -bool FastBuild = true; /* use sort/build instead of insertion build */ +bool BuildingBtree = false; /* see comment in btbuild() */ +bool FastBuild = true; /* use sort/build instead of + * insertion build */ /* - * btbuild() -- build a new btree index. + * btbuild() -- build a new btree index. * - * We use a global variable to record the fact that we're creating - * a new index. This is used to avoid high-concurrency locking, - * since the index won't be visible until this transaction commits - * and since building is guaranteed to be single-threaded. + * We use a global variable to record the fact that we're creating + * a new index. This is used to avoid high-concurrency locking, + * since the index won't be visible until this transaction commits + * and since building is guaranteed to be single-threaded. */ void btbuild(Relation heap, - Relation index, - int natts, - AttrNumber *attnum, - IndexStrategy istrat, - uint16 pcount, - Datum *params, - FuncIndexInfo *finfo, - PredInfo *predInfo) + Relation index, + int natts, + AttrNumber * attnum, + IndexStrategy istrat, + uint16 pcount, + Datum * params, + FuncIndexInfo * finfo, + PredInfo * predInfo) { - HeapScanDesc hscan; - Buffer buffer; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, itupdesc; - Datum *attdata; - bool *nulls; - InsertIndexResult res = 0; - int nhtups, nitups; - int i; - BTItem btitem; + HeapScanDesc hscan; + Buffer buffer; + HeapTuple htup; + IndexTuple itup; + TupleDesc htupdesc, + itupdesc; + Datum *attdata; + bool *nulls; + InsertIndexResult res = 0; + int nhtups, + nitups; + int i; + BTItem btitem; + #ifndef OMIT_PARTIAL_INDEX - ExprContext *econtext = (ExprContext *) NULL; - TupleTable tupleTable = (TupleTable) NULL; - TupleTableSlot *slot = (TupleTableSlot *) NULL; -#endif - Oid hrelid, irelid; - Node *pred, *oldPred; - void *spool = (void *) NULL; - bool isunique; - bool usefast; - - /* note that this is a new btree */ - BuildingBtree = true; - - pred = predInfo->pred; - oldPred = predInfo->oldPred; - - /* - * bootstrap processing does something strange, so don't use - * sort/build for initial catalog indices. at some point i need - * to look harder at this. (there is some kind of incremental - * processing going on there.) -- pma 08/29/95 - */ - usefast = (FastBuild && IsNormalProcessingMode()); + ExprContext *econtext = (ExprContext *) NULL; + TupleTable tupleTable = (TupleTable) NULL; + TupleTableSlot *slot = (TupleTableSlot *) NULL; -#ifdef BTREE_BUILD_STATS - if ( ShowExecutorStats ) - ResetUsage (); #endif + Oid hrelid, + irelid; + Node *pred, + *oldPred; + void *spool = (void *) NULL; + bool isunique; + bool usefast; - /* see if index is unique */ - isunique = IndexIsUniqueNoCache(RelationGetRelationId(index)); - - /* initialize the btree index metadata page (if this is a new index) */ - if (oldPred == NULL) - _bt_metapinit(index); - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetTupleDescriptor(heap); - itupdesc = RelationGetTupleDescriptor(index); - - /* get space for data items that'll appear in the index tuple */ - attdata = (Datum *) palloc(natts * sizeof(Datum)); - nulls = (bool *) palloc(natts * sizeof(bool)); - - /* - * If this is a predicate (partial) index, we will need to evaluate the - * predicate using ExecQual, which requires the current tuple to be in a - * slot of a TupleTable. In addition, ExecQual must have an ExprContext - * referring to that slot. Here, we initialize dummy TupleTable and - * ExprContext objects for this purpose. --Nels, Feb '92 - */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - econtext = makeNode(ExprContext); - FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + /* note that this is a new btree */ + BuildingBtree = true; + + pred = predInfo->pred; + oldPred = predInfo->oldPred; /* - * we never want to use sort/build if we are extending an - * existing partial index -- it works by inserting the - * newly-qualifying tuples into the existing index. - * (sort/build would overwrite the existing index with one - * consisting of the newly-qualifying tuples.) + * bootstrap processing does something strange, so don't use + * sort/build for initial catalog indices. at some point i need to + * look harder at this. (there is some kind of incremental processing + * going on there.) -- pma 08/29/95 */ - usefast = false; - } -#endif /* OMIT_PARTIAL_INDEX */ - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); - htup = heap_getnext(hscan, 0, &buffer); - - /* build the index */ - nhtups = nitups = 0; - - if (usefast) { - spool = _bt_spoolinit(index, 7, isunique); - res = (InsertIndexResult) NULL; - } - - for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { - - nhtups++; - + usefast = (FastBuild && IsNormalProcessingMode()); + +#ifdef BTREE_BUILD_STATS + if (ShowExecutorStats) + ResetUsage(); +#endif + + /* see if index is unique */ + isunique = IndexIsUniqueNoCache(RelationGetRelationId(index)); + + /* initialize the btree index metadata page (if this is a new index) */ + if (oldPred == NULL) + _bt_metapinit(index); + + /* get tuple descriptors for heap and index relations */ + htupdesc = RelationGetTupleDescriptor(heap); + itupdesc = RelationGetTupleDescriptor(index); + + /* get space for data items that'll appear in the index tuple */ + attdata = (Datum *) palloc(natts * sizeof(Datum)); + nulls = (bool *) palloc(natts * sizeof(bool)); + /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index + * If this is a predicate (partial) index, we will need to evaluate + * the predicate using ExecQual, which requires the current tuple to + * be in a slot of a TupleTable. In addition, ExecQual must have an + * ExprContext referring to that slot. Here, we initialize dummy + * TupleTable and ExprContext objects for this purpose. --Nels, Feb + * '92 */ - if (oldPred != NULL) { +#ifndef OMIT_PARTIAL_INDEX + if (pred != NULL || oldPred != NULL) + { + tupleTable = ExecCreateTupleTable(1); + slot = ExecAllocTableSlot(tupleTable); + econtext = makeNode(ExprContext); + FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + + /* + * we never want to use sort/build if we are extending an existing + * partial index -- it works by inserting the newly-qualifying + * tuples into the existing index. (sort/build would overwrite the + * existing index with one consisting of the newly-qualifying + * tuples.) + */ + usefast = false; + } +#endif /* OMIT_PARTIAL_INDEX */ + + /* start a heap scan */ + hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL); + htup = heap_getnext(hscan, 0, &buffer); + + /* build the index */ + nhtups = nitups = 0; + + if (usefast) + { + spool = _bt_spoolinit(index, 7, isunique); + res = (InsertIndexResult) NULL; + } + + for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) + { + + nhtups++; + + /* + * If oldPred != NULL, this is an EXTEND INDEX command, so skip + * this tuple if it was already in the existing partial index + */ + if (oldPred != NULL) + { #ifndef OMIT_PARTIAL_INDEX - /*SetSlotContents(slot, htup);*/ - slot->val = htup; - if (ExecQual((List*)oldPred, econtext) == true) { + /* SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List *) oldPred, econtext) == true) + { + nitups++; + continue; + } +#endif /* OMIT_PARTIAL_INDEX */ + } + + /* + * Skip this tuple if it doesn't satisfy the partial-index + * predicate + */ + if (pred != NULL) + { +#ifndef OMIT_PARTIAL_INDEX + /* SetSlotContents(slot, htup); */ + slot->val = htup; + if (ExecQual((List *) pred, econtext) == false) + continue; +#endif /* OMIT_PARTIAL_INDEX */ + } + nitups++; - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ + + /* + * For the current heap tuple, extract all the attributes we use + * in this index, and note which are null. + */ + + for (i = 1; i <= natts; i++) + { + int attoff; + bool attnull; + + /* + * Offsets are from the start of the tuple, and are + * zero-based; indices are one-based. The next call returns i + * - 1. That's data hiding for you. + */ + + attoff = AttrNumberGetAttrOffset(i); + attdata[attoff] = GetIndexValue(htup, + htupdesc, + attoff, + attnum, + finfo, + &attnull, + buffer); + nulls[attoff] = (attnull ? 'n' : ' '); + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(itupdesc, attdata, nulls); + + /* + * If the single index key is null, we don't insert it into the + * index. Btrees support scans on <, <=, =, >=, and >. Relational + * algebra says that A op B (where op is one of the operators + * above) returns null if either A or B is null. This means that + * no qualification used in an index scan could ever return true + * on a null attribute. It also means that indices can't be used + * by ISNULL or NOTNULL scans, but that's an artifact of the + * strategy map architecture chosen in 1986, not of the way nulls + * are handled here. + */ + + /* + * New comments: NULLs handling. While we can't do NULL + * comparison, we can follow simple rule for ordering items on + * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE. + * Sure, it's just rule for placing/finding items and no more - + * keytest'll return FALSE for a = 5 for items having 'a' isNULL. + * Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for how it + * works. - vadim 03/23/97 + * + * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; } + */ + + itup->t_tid = htup->t_ctid; + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index into + * a spool page for subsequent processing. otherwise, we insert + * into the btree. + */ + if (usefast) + { + _bt_spool(index, btitem, spool); + } + else + { + res = _bt_doinsert(index, btitem, isunique, heap); + } + + pfree(btitem); + pfree(itup); + if (res) + { + pfree(res); + } } - - /* Skip this tuple if it doesn't satisfy the partial-index predicate */ - if (pred != NULL) { + + /* okay, all heap tuples are indexed */ + heap_endscan(hscan); + + if (pred != NULL || oldPred != NULL) + { #ifndef OMIT_PARTIAL_INDEX - /* SetSlotContents(slot, htup); */ - slot->val = htup; - if (ExecQual((List*)pred, econtext) == false) - continue; -#endif /* OMIT_PARTIAL_INDEX */ + ExecDestroyTupleTable(tupleTable, true); + pfree(econtext); +#endif /* OMIT_PARTIAL_INDEX */ } - - nitups++; - + /* - * For the current heap tuple, extract all the attributes - * we use in this index, and note which are null. + * if we are doing bottom-up btree build, we now have a bunch of + * sorted runs in the spool pages. finish the build by (1) merging + * the runs, (2) inserting the sorted tuples into btree pages and (3) + * building the upper levels. */ - - for (i = 1; i <= natts; i++) { - int attoff; - bool attnull; - - /* - * Offsets are from the start of the tuple, and are - * zero-based; indices are one-based. The next call - * returns i - 1. That's data hiding for you. - */ - - attoff = AttrNumberGetAttrOffset(i); - attdata[attoff] = GetIndexValue(htup, - htupdesc, - attoff, - attnum, - finfo, - &attnull, - buffer); - nulls[attoff] = (attnull ? 'n' : ' '); + if (usefast) + { + _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */ + _bt_leafbuild(index, spool); + _bt_spooldestroy(spool); } - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - - /* - * If the single index key is null, we don't insert it into - * the index. Btrees support scans on <, <=, =, >=, and >. - * Relational algebra says that A op B (where op is one of the - * operators above) returns null if either A or B is null. This - * means that no qualification used in an index scan could ever - * return true on a null attribute. It also means that indices - * can't be used by ISNULL or NOTNULL scans, but that's an - * artifact of the strategy map architecture chosen in 1986, not - * of the way nulls are handled here. - */ - /* - * New comments: NULLs handling. - * While we can't do NULL comparison, we can follow simple - * rule for ordering items on btree pages - NULLs greater - * NOT_NULLs and NULL = NULL is TRUE. Sure, it's just rule - * for placing/finding items and no more - keytest'll return - * FALSE for a = 5 for items having 'a' isNULL. - * Look at _bt_skeycmp, _bt_compare and _bt_itemcmp for - * how it works. - vadim 03/23/97 - - if (itup->t_info & INDEX_NULL_MASK) { - pfree(itup); - continue; + +#ifdef BTREE_BUILD_STATS + if (ShowExecutorStats) + { + fprintf(stderr, "! BtreeBuild Stats:\n"); + ShowUsage(); + ResetUsage(); } - */ - - itup->t_tid = htup->t_ctid; - btitem = _bt_formitem(itup); +#endif /* - * if we are doing bottom-up btree build, we insert the index - * into a spool page for subsequent processing. otherwise, we - * insert into the btree. + * Since we just counted the tuples in the heap, we update its stats + * in pg_class to guarantee that the planner takes advantage of the + * index we just created. Finally, only update statistics during + * normal index definitions, not for indices on system catalogs + * created during bootstrap processing. We must close the relations + * before updatings statistics to guarantee that the relcache entries + * are flushed when we increment the command counter in UpdateStats(). */ - if (usefast) { - _bt_spool(index, btitem, spool); - } else { - res = _bt_doinsert(index, btitem, isunique, heap); + if (IsNormalProcessingMode()) + { + hrelid = heap->rd_id; + irelid = index->rd_id; + heap_close(heap); + index_close(index); + UpdateStats(hrelid, nhtups, true); + UpdateStats(irelid, nitups, false); + if (oldPred != NULL) + { + if (nitups == nhtups) + pred = NULL; + UpdateIndexPredicate(irelid, oldPred, pred); + } } - pfree(btitem); - pfree(itup); - if (res) { - pfree(res); - } - } - - /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - - if (pred != NULL || oldPred != NULL) { -#ifndef OMIT_PARTIAL_INDEX - ExecDestroyTupleTable(tupleTable, true); - pfree(econtext); -#endif /* OMIT_PARTIAL_INDEX */ - } - - /* - * if we are doing bottom-up btree build, we now have a bunch of - * sorted runs in the spool pages. finish the build by (1) - * merging the runs, (2) inserting the sorted tuples into btree - * pages and (3) building the upper levels. - */ - if (usefast) { - _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */ - _bt_leafbuild(index, spool); - _bt_spooldestroy(spool); - } + pfree(nulls); + pfree(attdata); -#ifdef BTREE_BUILD_STATS - if ( ShowExecutorStats ) - { - fprintf(stderr, "! BtreeBuild Stats:\n"); - ShowUsage (); - ResetUsage (); - } -#endif - - /* - * Since we just counted the tuples in the heap, we update its - * stats in pg_class to guarantee that the planner takes advantage - * of the index we just created. Finally, only update statistics - * during normal index definitions, not for indices on system catalogs - * created during bootstrap processing. We must close the relations - * before updatings statistics to guarantee that the relcache entries - * are flushed when we increment the command counter in UpdateStats(). - */ - if (IsNormalProcessingMode()) - { - hrelid = heap->rd_id; - irelid = index->rd_id; - heap_close(heap); - index_close(index); - UpdateStats(hrelid, nhtups, true); - UpdateStats(irelid, nitups, false); - if (oldPred != NULL) { - if (nitups == nhtups) pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } - } - - pfree(nulls); - pfree(attdata); - - /* all done */ - BuildingBtree = false; + /* all done */ + BuildingBtree = false; } /* - * btinsert() -- insert an index tuple into a btree. + * btinsert() -- insert an index tuple into a btree. * - * Descend the tree recursively, find the appropriate location for our - * new tuple, put it there, set its unique OID as appropriate, and - * return an InsertIndexResult to the caller. + * Descend the tree recursively, find the appropriate location for our + * new tuple, put it there, set its unique OID as appropriate, and + * return an InsertIndexResult to the caller. */ InsertIndexResult -btinsert(Relation rel, Datum *datum, char *nulls, ItemPointer ht_ctid, Relation heapRel) +btinsert(Relation rel, Datum * datum, char *nulls, ItemPointer ht_ctid, Relation heapRel) { - BTItem btitem; - IndexTuple itup; - InsertIndexResult res; - - /* generate an index tuple */ - itup = index_formtuple(RelationGetTupleDescriptor(rel), datum, nulls); - itup->t_tid = *ht_ctid; - - /* - * See comments in btbuild. - - if (itup->t_info & INDEX_NULL_MASK) - return ((InsertIndexResult) NULL); - */ - - btitem = _bt_formitem(itup); - - res = _bt_doinsert(rel, btitem, - IndexIsUnique(RelationGetRelationId(rel)), heapRel); - - pfree(btitem); - pfree(itup); - - /* adjust any active scans that will be affected by this insertion */ - _bt_adjscans(rel, &(res->pointerData), BT_INSERT); - - return (res); + BTItem btitem; + IndexTuple itup; + InsertIndexResult res; + + /* generate an index tuple */ + itup = index_formtuple(RelationGetTupleDescriptor(rel), datum, nulls); + itup->t_tid = *ht_ctid; + + /* + * See comments in btbuild. + * + * if (itup->t_info & INDEX_NULL_MASK) return ((InsertIndexResult) NULL); + */ + + btitem = _bt_formitem(itup); + + res = _bt_doinsert(rel, btitem, + IndexIsUnique(RelationGetRelationId(rel)), heapRel); + + pfree(btitem); + pfree(itup); + + /* adjust any active scans that will be affected by this insertion */ + _bt_adjscans(rel, &(res->pointerData), BT_INSERT); + + return (res); } /* - * btgettuple() -- Get the next tuple in the scan. + * btgettuple() -- Get the next tuple in the scan. */ -char * +char * btgettuple(IndexScanDesc scan, ScanDirection dir) { - RetrieveIndexResult res; - - /* - * If we've already initialized this scan, we can just advance it - * in the appropriate direction. If we haven't done so yet, we - * call a routine to get the first item in the scan. - */ - - if (ItemPointerIsValid(&(scan->currentItemData))) - res = _bt_next(scan, dir); - else - res = _bt_first(scan, dir); - - return ((char *) res); + RetrieveIndexResult res; + + /* + * If we've already initialized this scan, we can just advance it in + * the appropriate direction. If we haven't done so yet, we call a + * routine to get the first item in the scan. + */ + + if (ItemPointerIsValid(&(scan->currentItemData))) + res = _bt_next(scan, dir); + else + res = _bt_first(scan, dir); + + return ((char *) res); } /* - * btbeginscan() -- start a scan on a btree index + * btbeginscan() -- start a scan on a btree index */ -char * +char * btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey) { - IndexScanDesc scan; - - /* get the scan */ - scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); - - /* register scan in case we change pages it's using */ - _bt_regscan(scan); - - return ((char *) scan); + IndexScanDesc scan; + + /* get the scan */ + scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); + + /* register scan in case we change pages it's using */ + _bt_regscan(scan); + + return ((char *) scan); } /* - * btrescan() -- rescan an index relation + * btrescan() -- rescan an index relation */ void btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey) { - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - - /* we hold a read lock on the current page in the scan */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { - _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - /* and we hold a read lock on the last marked item in the scan */ - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { - _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); - so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - if ( so == NULL ) /* if called from btbeginscan */ - { - so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; - so->keyData = (ScanKey) NULL; - if ( scan->numberOfKeys > 0) - so->keyData = (ScanKey) palloc (scan->numberOfKeys * sizeof(ScanKeyData)); - scan->opaque = so; - scan->flags = 0x0; - } - - /* - * Reset the scan keys. Note that keys ordering stuff - * moved to _bt_first. - vadim 05/05/97 - */ - so->numberOfKeys = scan->numberOfKeys; - if (scan->numberOfKeys > 0) { - memmove(scan->keyData, - scankey, - scan->numberOfKeys * sizeof(ScanKeyData)); - memmove(so->keyData, - scankey, - so->numberOfKeys * sizeof(ScanKeyData)); - } + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* we hold a read lock on the current page in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* and we hold a read lock on the last marked item in the scan */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + { + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (so == NULL) /* if called from btbeginscan */ + { + so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); + so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer; + so->keyData = (ScanKey) NULL; + if (scan->numberOfKeys > 0) + so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); + scan->opaque = so; + scan->flags = 0x0; + } + + /* + * Reset the scan keys. Note that keys ordering stuff moved to + * _bt_first. - vadim 05/05/97 + */ + so->numberOfKeys = scan->numberOfKeys; + if (scan->numberOfKeys > 0) + { + memmove(scan->keyData, + scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + memmove(so->keyData, + scankey, + so->numberOfKeys * sizeof(ScanKeyData)); + } } void btmovescan(IndexScanDesc scan, Datum v) { - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - - /* release any locks we still hold */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { - _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - -/* scan->keyData[0].sk_argument = v; */ - so->keyData[0].sk_argument = v; + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + +/* scan->keyData[0].sk_argument = v; */ + so->keyData[0].sk_argument = v; } /* - * btendscan() -- close down a scan + * btendscan() -- close down a scan */ void btendscan(IndexScanDesc scan) { - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - - /* release any locks we still hold */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { - if (BufferIsValid(so->btso_curbuf)) - _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { - if (BufferIsValid(so->btso_mrkbuf)) - _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); - so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - if ( so->keyData != (ScanKey) NULL ) - pfree (so->keyData); - pfree (so); - - _bt_dropscan(scan); + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release any locks we still hold */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + { + if (BufferIsValid(so->btso_curbuf)) + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + { + if (BufferIsValid(so->btso_mrkbuf)) + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + if (so->keyData != (ScanKey) NULL) + pfree(so->keyData); + pfree(so); + + _bt_dropscan(scan); } /* - * btmarkpos() -- save current scan position + * btmarkpos() -- save current scan position */ void btmarkpos(IndexScanDesc scan) { - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - - /* release lock on old marked data, if any */ - if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) { - _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); - so->btso_mrkbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - /* bump lock on currentItemData and copy to currentMarkData */ - if (ItemPointerIsValid(&(scan->currentItemData))) { - so->btso_mrkbuf = _bt_getbuf(scan->relation, - BufferGetBlockNumber(so->btso_curbuf), - BT_READ); - scan->currentMarkData = scan->currentItemData; - } + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on old marked data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) + { + _bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ); + so->btso_mrkbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentItemData and copy to currentMarkData */ + if (ItemPointerIsValid(&(scan->currentItemData))) + { + so->btso_mrkbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_curbuf), + BT_READ); + scan->currentMarkData = scan->currentItemData; + } } /* - * btrestrpos() -- restore scan to last saved position + * btrestrpos() -- restore scan to last saved position */ void btrestrpos(IndexScanDesc scan) { - ItemPointer iptr; - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - - /* release lock on current data, if any */ - if (ItemPointerIsValid(iptr = &(scan->currentItemData))) { - _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(iptr); - } - - /* bump lock on currentMarkData and copy to currentItemData */ - if (ItemPointerIsValid(&(scan->currentMarkData))) { - so->btso_curbuf = _bt_getbuf(scan->relation, - BufferGetBlockNumber(so->btso_mrkbuf), - BT_READ); - - scan->currentItemData = scan->currentMarkData; - } + ItemPointer iptr; + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + + /* release lock on current data, if any */ + if (ItemPointerIsValid(iptr = &(scan->currentItemData))) + { + _bt_relbuf(scan->relation, so->btso_curbuf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(iptr); + } + + /* bump lock on currentMarkData and copy to currentItemData */ + if (ItemPointerIsValid(&(scan->currentMarkData))) + { + so->btso_curbuf = _bt_getbuf(scan->relation, + BufferGetBlockNumber(so->btso_mrkbuf), + BT_READ); + + scan->currentItemData = scan->currentMarkData; + } } /* stubs */ void btdelete(Relation rel, ItemPointer tid) { - /* adjust any active scans that will be affected by this deletion */ - _bt_adjscans(rel, tid, BT_DELETE); - - /* delete the data from the page */ - _bt_pagedel(rel, tid); + /* adjust any active scans that will be affected by this deletion */ + _bt_adjscans(rel, tid, BT_DELETE); + + /* delete the data from the page */ + _bt_pagedel(rel, tid); } diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c index 5e23fe13d7b..8a2042403ad 100644 --- a/src/backend/access/nbtree/nbtscan.c +++ b/src/backend/access/nbtree/nbtscan.c @@ -1,28 +1,28 @@ /*------------------------------------------------------------------------- * * btscan.c-- - * manage scans on btrees. + * manage scans on btrees. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.7 1997/02/18 17:13:45 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.8 1997/09/07 04:38:57 momjian Exp $ * * * NOTES - * Because we can be doing an index scan on a relation while we update - * it, we need to avoid missing data that moves around in the index. - * The routines and global variables in this file guarantee that all - * scans in the local address space stay correctly positioned. This - * is all we need to worry about, since write locking guarantees that - * no one else will be on the same page at the same time as we are. + * Because we can be doing an index scan on a relation while we update + * it, we need to avoid missing data that moves around in the index. + * The routines and global variables in this file guarantee that all + * scans in the local address space stay correctly positioned. This + * is all we need to worry about, since write locking guarantees that + * no one else will be on the same page at the same time as we are. * - * The scheme is to manage a list of active scans in the current backend. - * Whenever we add or remove records from an index, or whenever we - * split a leaf page, we check the list of active scans to see if any - * has been affected. A scan is affected only if it is on the same - * relation, and the same page, as the update. + * The scheme is to manage a list of active scans in the current backend. + * Whenever we add or remove records from an index, or whenever we + * split a leaf page, we check the list of active scans to see if any + * has been affected. A scan is affected only if it is on the same + * relation, and the same page, as the update. * *------------------------------------------------------------------------- */ @@ -32,83 +32,87 @@ #include <storage/bufpage.h> #include <access/nbtree.h> -typedef struct BTScanListData { - IndexScanDesc btsl_scan; - struct BTScanListData *btsl_next; -} BTScanListData; +typedef struct BTScanListData +{ + IndexScanDesc btsl_scan; + struct BTScanListData *btsl_next; +} BTScanListData; -typedef BTScanListData *BTScanList; +typedef BTScanListData *BTScanList; -static BTScanList BTScans = (BTScanList) NULL; +static BTScanList BTScans = (BTScanList) NULL; -static void _bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno); -static bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); +static void _bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno); +static bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); /* - * _bt_regscan() -- register a new scan. + * _bt_regscan() -- register a new scan. */ void _bt_regscan(IndexScanDesc scan) { - BTScanList new_el; - - new_el = (BTScanList) palloc(sizeof(BTScanListData)); - new_el->btsl_scan = scan; - new_el->btsl_next = BTScans; - BTScans = new_el; + BTScanList new_el; + + new_el = (BTScanList) palloc(sizeof(BTScanListData)); + new_el->btsl_scan = scan; + new_el->btsl_next = BTScans; + BTScans = new_el; } /* - * _bt_dropscan() -- drop a scan from the scan list + * _bt_dropscan() -- drop a scan from the scan list */ void _bt_dropscan(IndexScanDesc scan) { - BTScanList chk, last; - - last = (BTScanList) NULL; - for (chk = BTScans; - chk != (BTScanList) NULL && chk->btsl_scan != scan; - chk = chk->btsl_next) { - last = chk; - } - - if (chk == (BTScanList) NULL) - elog(WARN, "btree scan list trashed; can't find 0x%lx", scan); - - if (last == (BTScanList) NULL) - BTScans = chk->btsl_next; - else - last->btsl_next = chk->btsl_next; - - pfree (chk); + BTScanList chk, + last; + + last = (BTScanList) NULL; + for (chk = BTScans; + chk != (BTScanList) NULL && chk->btsl_scan != scan; + chk = chk->btsl_next) + { + last = chk; + } + + if (chk == (BTScanList) NULL) + elog(WARN, "btree scan list trashed; can't find 0x%lx", scan); + + if (last == (BTScanList) NULL) + BTScans = chk->btsl_next; + else + last->btsl_next = chk->btsl_next; + + pfree(chk); } /* - * _bt_adjscans() -- adjust all scans in the scan list to compensate - * for a given deletion or insertion + * _bt_adjscans() -- adjust all scans in the scan list to compensate + * for a given deletion or insertion */ void _bt_adjscans(Relation rel, ItemPointer tid, int op) { - BTScanList l; - Oid relid; - - relid = rel->rd_id; - for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) { - if (relid == l->btsl_scan->relation->rd_id) - _bt_scandel(l->btsl_scan, op, - ItemPointerGetBlockNumber(tid), - ItemPointerGetOffsetNumber(tid)); - } + BTScanList l; + Oid relid; + + relid = rel->rd_id; + for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) + { + if (relid == l->btsl_scan->relation->rd_id) + _bt_scandel(l->btsl_scan, op, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + } } /* - * _bt_scandel() -- adjust a single scan + * _bt_scandel() -- adjust a single scan * * because each index page is always maintained as an ordered array of * index tuples, the index tuples on a given page shift beneath any - * given scan. an index modification "behind" a scan position (i.e., + * given scan. an index modification "behind" a scan position (i.e., * same page, lower or equal offset number) will therefore force us to * adjust the scan in the following ways: * @@ -126,80 +130,85 @@ _bt_adjscans(Relation rel, ItemPointer tid, int op) static void _bt_scandel(IndexScanDesc scan, int op, BlockNumber blkno, OffsetNumber offno) { - ItemPointer current; - Buffer buf; - BTScanOpaque so; - - if (!_bt_scantouched(scan, blkno, offno)) - return; - - so = (BTScanOpaque) scan->opaque; - buf = so->btso_curbuf; - - current = &(scan->currentItemData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) { - switch (op) { - case BT_INSERT: - _bt_step(scan, &buf, ForwardScanDirection); - break; - case BT_DELETE: - _bt_step(scan, &buf, BackwardScanDirection); - break; - default: - elog(WARN, "_bt_scandel: bad operation '%d'", op); - /*NOTREACHED*/ + ItemPointer current; + Buffer buf; + BTScanOpaque so; + + if (!_bt_scantouched(scan, blkno, offno)) + return; + + so = (BTScanOpaque) scan->opaque; + buf = so->btso_curbuf; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + { + switch (op) + { + case BT_INSERT: + _bt_step(scan, &buf, ForwardScanDirection); + break; + case BT_DELETE: + _bt_step(scan, &buf, BackwardScanDirection); + break; + default: + elog(WARN, "_bt_scandel: bad operation '%d'", op); + /* NOTREACHED */ + } + so->btso_curbuf = buf; } - so->btso_curbuf = buf; - } - - current = &(scan->currentMarkData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) { - ItemPointerData tmp; - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - switch (op) { - case BT_INSERT: - _bt_step(scan, &buf, ForwardScanDirection); - break; - case BT_DELETE: - _bt_step(scan, &buf, BackwardScanDirection); - break; - default: - elog(WARN, "_bt_scandel: bad operation '%d'", op); - /*NOTREACHED*/ + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + { + ItemPointerData tmp; + + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; + switch (op) + { + case BT_INSERT: + _bt_step(scan, &buf, ForwardScanDirection); + break; + case BT_DELETE: + _bt_step(scan, &buf, BackwardScanDirection); + break; + default: + elog(WARN, "_bt_scandel: bad operation '%d'", op); + /* NOTREACHED */ + } + so->btso_mrkbuf = buf; + tmp = *current; + *current = scan->currentItemData; + scan->currentItemData = tmp; } - so->btso_mrkbuf = buf; - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - } } /* - * _bt_scantouched() -- check to see if a scan is affected by a given - * change to the index + * _bt_scantouched() -- check to see if a scan is affected by a given + * change to the index */ -static bool +static bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) { - ItemPointer current; - - current = &(scan->currentItemData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - return (true); - - current = &(scan->currentMarkData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - return (true); - - return (false); + ItemPointer current; + + current = &(scan->currentItemData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + current = &(scan->currentMarkData); + if (ItemPointerIsValid(current) + && ItemPointerGetBlockNumber(current) == blkno + && ItemPointerGetOffsetNumber(current) >= offno) + return (true); + + return (false); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 1d1c8072b93..8b1f75b7533 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * btsearch.c-- - * search code for postgres btrees. + * search code for postgres btrees. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.23 1997/08/19 21:29:42 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.24 1997/09/07 04:38:58 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -22,1435 +22,1516 @@ #include <catalog/pg_proc.h> #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif -static BTStack -_bt_searchr(Relation rel, int keysz, ScanKey scankey, - Buffer *bufP, BTStack stack_in); -static OffsetNumber -_bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, - Size keysz, ScanKey scankey, OffsetNumber offnum); -static int -_bt_compare(Relation rel, TupleDesc itupdesc, Page page, - int keysz, ScanKey scankey, OffsetNumber offnum); -static bool -_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); -static RetrieveIndexResult -_bt_endpoint(IndexScanDesc scan, ScanDirection dir); +static BTStack +_bt_searchr(Relation rel, int keysz, ScanKey scankey, + Buffer * bufP, BTStack stack_in); +static OffsetNumber +_bt_firsteq(Relation rel, TupleDesc itupdesc, Page page, + Size keysz, ScanKey scankey, OffsetNumber offnum); +static int +_bt_compare(Relation rel, TupleDesc itupdesc, Page page, + int keysz, ScanKey scankey, OffsetNumber offnum); +static bool + _bt_twostep(IndexScanDesc scan, Buffer * bufP, ScanDirection dir); +static RetrieveIndexResult + _bt_endpoint(IndexScanDesc scan, ScanDirection dir); /* - * _bt_search() -- Search for a scan key in the index. + * _bt_search() -- Search for a scan key in the index. * - * This routine is actually just a helper that sets things up and - * calls a recursive-descent search routine on the tree. + * This routine is actually just a helper that sets things up and + * calls a recursive-descent search routine on the tree. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP) +_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer * bufP) { - *bufP = _bt_getroot(rel, BT_READ); - return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL)); + *bufP = _bt_getroot(rel, BT_READ); + return (_bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL)); } /* - * _bt_searchr() -- Search the tree recursively for a particular scankey. + * _bt_searchr() -- Search the tree recursively for a particular scankey. */ -static BTStack +static BTStack _bt_searchr(Relation rel, - int keysz, - ScanKey scankey, - Buffer *bufP, - BTStack stack_in) + int keysz, + ScanKey scankey, + Buffer * bufP, + BTStack stack_in) { - BTStack stack; - OffsetNumber offnum; - Page page; - BTPageOpaque opaque; - BlockNumber par_blkno; - BlockNumber blkno; - ItemId itemid; - BTItem btitem; - BTItem item_save; - int item_nbytes; - IndexTuple itup; - - /* if this is a leaf page, we're done */ - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (opaque->btpo_flags & BTP_LEAF) - return (stack_in); - - /* - * Find the appropriate item on the internal page, and get the child - * page that it points to. - */ - - par_blkno = BufferGetBlockNumber(*bufP); - offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); - itemid = PageGetItemId(page, offnum); - btitem = (BTItem) PageGetItem(page, itemid); - itup = &(btitem->bti_itup); - blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - - /* - * We need to save the bit image of the index entry we chose in the - * parent page on a stack. In case we split the tree, we'll use this - * bit image to figure out what our real parent page is, in case the - * parent splits while we're working lower in the tree. See the paper - * by Lehman and Yao for how this is detected and handled. (We use - * unique OIDs to disambiguate duplicate keys in the index -- Lehman - * and Yao disallow duplicate keys). - */ - - item_nbytes = ItemIdGetLength(itemid); - item_save = (BTItem) palloc(item_nbytes); - memmove((char *) item_save, (char *) btitem, item_nbytes); - stack = (BTStack) palloc(sizeof(BTStackData)); - stack->bts_blkno = par_blkno; - stack->bts_offset = offnum; - stack->bts_btitem = item_save; - stack->bts_parent = stack_in; - - /* drop the read lock on the parent page and acquire one on the child */ - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = _bt_getbuf(rel, blkno, BT_READ); - - /* - * Race -- the page we just grabbed may have split since we read its - * pointer in the parent. If it has, we may need to move right to its - * new sibling. Do that. - */ - - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); - - /* okay, all set to move down a level */ - return (_bt_searchr(rel, keysz, scankey, bufP, stack)); + BTStack stack; + OffsetNumber offnum; + Page page; + BTPageOpaque opaque; + BlockNumber par_blkno; + BlockNumber blkno; + ItemId itemid; + BTItem btitem; + BTItem item_save; + int item_nbytes; + IndexTuple itup; + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & BTP_LEAF) + return (stack_in); + + /* + * Find the appropriate item on the internal page, and get the child + * page that it points to. + */ + + par_blkno = BufferGetBlockNumber(*bufP); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + + /* + * We need to save the bit image of the index entry we chose in the + * parent page on a stack. In case we split the tree, we'll use this + * bit image to figure out what our real parent page is, in case the + * parent splits while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use + * unique OIDs to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys). + */ + + item_nbytes = ItemIdGetLength(itemid); + item_save = (BTItem) palloc(item_nbytes); + memmove((char *) item_save, (char *) btitem, item_nbytes); + stack = (BTStack) palloc(sizeof(BTStackData)); + stack->bts_blkno = par_blkno; + stack->bts_offset = offnum; + stack->bts_btitem = item_save; + stack->bts_parent = stack_in; + + /* drop the read lock on the parent page and acquire one on the child */ + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + + /* + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent. If it has, we may need to move right to its + * new sibling. Do that. + */ + + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + + /* okay, all set to move down a level */ + return (_bt_searchr(rel, keysz, scankey, bufP, stack)); } /* - * _bt_moveright() -- move right in the btree if necessary. + * _bt_moveright() -- move right in the btree if necessary. * - * When we drop and reacquire a pointer to a page, it is possible that - * the page has changed in the meanwhile. If this happens, we're - * guaranteed that the page has "split right" -- that is, that any - * data that appeared on the page originally is either on the page - * or strictly to the right of it. + * When we drop and reacquire a pointer to a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. * - * This routine decides whether or not we need to move right in the - * tree by examining the high key entry on the page. If that entry - * is strictly less than one we expect to be on the page, then our - * picture of the page is incorrect and we need to move right. + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry + * is strictly less than one we expect to be on the page, then our + * picture of the page is incorrect and we need to move right. * - * On entry, we have the buffer pinned and a lock of the proper type. - * If we move right, we release the buffer and lock and acquire the - * same on the right sibling. + * On entry, we have the buffer pinned and a lock of the proper type. + * If we move right, we release the buffer and lock and acquire the + * same on the right sibling. */ Buffer _bt_moveright(Relation rel, - Buffer buf, - int keysz, - ScanKey scankey, - int access) + Buffer buf, + int keysz, + ScanKey scankey, + int access) { - Page page; - BTPageOpaque opaque; - ItemId hikey; - BlockNumber rblkno; - int natts = rel->rd_rel->relnatts; - - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* if we're on a rightmost page, we don't need to move right */ - if (P_RIGHTMOST(opaque)) - return (buf); - - /* by convention, item 0 on non-rightmost pages is the high key */ - hikey = PageGetItemId(page, P_HIKEY); - - /* - * If the scan key that brought us to this page is >= the high key - * stored on the page, then the page has split and we need to move - * right. - */ - - if (_bt_skeycmp(rel, keysz, scankey, page, hikey, - BTGreaterEqualStrategyNumber)) - { - /* move right as long as we need to */ - do + Page page; + BTPageOpaque opaque; + ItemId hikey; + BlockNumber rblkno; + int natts = rel->rd_rel->relnatts; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* if we're on a rightmost page, we don't need to move right */ + if (P_RIGHTMOST(opaque)) + return (buf); + + /* by convention, item 0 on non-rightmost pages is the high key */ + hikey = PageGetItemId(page, P_HIKEY); + + /* + * If the scan key that brought us to this page is >= the high key + * stored on the page, then the page has split and we need to move + * right. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)) { - OffsetNumber offmax = PageGetMaxOffsetNumber(page); - /* - * If this page consists of all duplicate keys (hikey and first - * key on the page have the same value), then we don't need to - * step right. - * - * NOTE for multi-column indices: we may do scan using - * keys not for all attrs. But we handle duplicates - * using all attrs in _bt_insert/_bt_spool code. - * And so we've to compare scankey with _last_ item - * on this page to do not lose "good" tuples if number - * of attrs > keysize. Example: (2,0) - last items on - * this page, (2,1) - first item on next page (hikey), - * our scankey is x = 2. Scankey == (2,1) because of - * we compare first attrs only, but we shouldn't to move - * right of here. - vadim 04/15/97 - */ - - if ( _bt_skeycmp (rel, keysz, scankey, page, hikey, - BTEqualStrategyNumber) ) - { - if ( opaque->btpo_flags & BTP_CHAIN ) - { - Assert ( ( opaque->btpo_flags & BTP_LEAF ) || offmax > P_HIKEY ); - break; - } - if ( offmax > P_HIKEY ) - { - if ( natts == keysz ) /* sanity checks */ - { - if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId (page, P_FIRSTKEY), - BTEqualStrategyNumber) ) - elog (FATAL, "btree: BTP_CHAIN flag was expected"); - if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId (page, offmax), - BTEqualStrategyNumber) ) - elog (FATAL, "btree: unexpected equal last item"); - if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId (page, offmax), - BTLessStrategyNumber) ) - elog (FATAL, "btree: unexpected greater last item"); - /* move right */ - } - else if ( _bt_skeycmp (rel, keysz, scankey, page, - PageGetItemId (page, offmax), - BTLessEqualStrategyNumber) ) - break; - } - } - - /* step right one page */ - rblkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); - buf = _bt_getbuf(rel, rblkno, access); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - hikey = PageGetItemId(page, P_HIKEY); - - } while (! P_RIGHTMOST(opaque) - && _bt_skeycmp(rel, keysz, scankey, page, hikey, - BTGreaterEqualStrategyNumber)); - } - return (buf); + /* move right as long as we need to */ + do + { + OffsetNumber offmax = PageGetMaxOffsetNumber(page); + + /* + * If this page consists of all duplicate keys (hikey and + * first key on the page have the same value), then we don't + * need to step right. + * + * NOTE for multi-column indices: we may do scan using keys not + * for all attrs. But we handle duplicates using all attrs in + * _bt_insert/_bt_spool code. And so we've to compare scankey + * with _last_ item on this page to do not lose "good" tuples + * if number of attrs > keysize. Example: (2,0) - last items + * on this page, (2,1) - first item on next page (hikey), our + * scankey is x = 2. Scankey == (2,1) because of we compare + * first attrs only, but we shouldn't to move right of here. + * - vadim 04/15/97 + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, hikey, + BTEqualStrategyNumber)) + { + if (opaque->btpo_flags & BTP_CHAIN) + { + Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY); + break; + } + if (offmax > P_HIKEY) + { + if (natts == keysz) /* sanity checks */ + { + if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, P_FIRSTKEY), + BTEqualStrategyNumber)) + elog(FATAL, "btree: BTP_CHAIN flag was expected"); + if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, offmax), + BTEqualStrategyNumber)) + elog(FATAL, "btree: unexpected equal last item"); + if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, offmax), + BTLessStrategyNumber)) + elog(FATAL, "btree: unexpected greater last item"); + /* move right */ + } + else if (_bt_skeycmp(rel, keysz, scankey, page, + PageGetItemId(page, offmax), + BTLessEqualStrategyNumber)) + break; + } + } + + /* step right one page */ + rblkno = opaque->btpo_next; + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, rblkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + hikey = PageGetItemId(page, P_HIKEY); + + } while (!P_RIGHTMOST(opaque) + && _bt_skeycmp(rel, keysz, scankey, page, hikey, + BTGreaterEqualStrategyNumber)); + } + return (buf); } /* - * _bt_skeycmp() -- compare a scan key to a particular item on a page using - * a requested strategy (<, <=, =, >=, >). + * _bt_skeycmp() -- compare a scan key to a particular item on a page using + * a requested strategy (<, <=, =, >=, >). * - * We ignore the unique OIDs stored in the btree item here. Those - * numbers are intended for use internally only, in repositioning a - * scan after a page split. They do not impose any meaningful ordering. + * We ignore the unique OIDs stored in the btree item here. Those + * numbers are intended for use internally only, in repositioning a + * scan after a page split. They do not impose any meaningful ordering. * - * The comparison is A <op> B, where A is the scan key and B is the - * tuple pointed at by itemid on page. + * The comparison is A <op> B, where A is the scan key and B is the + * tuple pointed at by itemid on page. */ bool _bt_skeycmp(Relation rel, - Size keysz, - ScanKey scankey, - Page page, - ItemId itemid, - StrategyNumber strat) + Size keysz, + ScanKey scankey, + Page page, + ItemId itemid, + StrategyNumber strat) { - BTItem item; - IndexTuple indexTuple; - TupleDesc tupDes; - ScanKey entry; - int i; - Datum attrDatum; - Datum keyDatum; - bool compare; - bool isNull; - bool useEqual = false; - bool keyNull; - - if ( strat == BTLessEqualStrategyNumber ) - { - useEqual = true; - strat = BTLessStrategyNumber; - } - else if ( strat == BTGreaterEqualStrategyNumber ) - { - useEqual = true; - strat = BTGreaterStrategyNumber; - } - - item = (BTItem) PageGetItem(page, itemid); - indexTuple = &(item->bti_itup); - - tupDes = RelationGetTupleDescriptor(rel); - - /* see if the comparison is true for all of the key attributes */ - for (i=1; i <= keysz; i++) { - - entry = &scankey[i-1]; - Assert ( entry->sk_attno == i ); - attrDatum = index_getattr(indexTuple, - entry->sk_attno, - tupDes, - &isNull); - keyDatum = entry->sk_argument; - - /* see comments about NULLs handling in btbuild */ - if ( entry->sk_flags & SK_ISNULL ) /* key is NULL */ + BTItem item; + IndexTuple indexTuple; + TupleDesc tupDes; + ScanKey entry; + int i; + Datum attrDatum; + Datum keyDatum; + bool compare; + bool isNull; + bool useEqual = false; + bool keyNull; + + if (strat == BTLessEqualStrategyNumber) { - Assert ( entry->sk_procedure == NullValueRegProcedure ); - keyNull = true; - if ( isNull ) - compare = ( strat == BTEqualStrategyNumber ) ? true : false; - else - compare = ( strat == BTGreaterStrategyNumber ) ? true : false; - } - else if ( isNull ) /* key is NOT_NULL and item is NULL */ - { - keyNull = false; - compare = ( strat == BTLessStrategyNumber ) ? true : false; - } - else - { - keyNull = false; - compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); + useEqual = true; + strat = BTLessStrategyNumber; } - - if ( compare ) /* true for one of ">, <, =" */ + else if (strat == BTGreaterEqualStrategyNumber) { - if ( strat != BTEqualStrategyNumber ) - return (true); + useEqual = true; + strat = BTGreaterStrategyNumber; } - else /* false for one of ">, <, =" */ + + item = (BTItem) PageGetItem(page, itemid); + indexTuple = &(item->bti_itup); + + tupDes = RelationGetTupleDescriptor(rel); + + /* see if the comparison is true for all of the key attributes */ + for (i = 1; i <= keysz; i++) { - if ( strat == BTEqualStrategyNumber ) - return (false); - /* - * if original strat was "<=, >=" OR - * "<, >" but some attribute(s) left - * - need to test for Equality - */ - if ( useEqual || i < keysz ) - { - if ( keyNull || isNull ) - compare = ( keyNull && isNull ) ? true : false; - else - compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber, - keyDatum, attrDatum); - if ( compare ) /* key' and item' attributes are equal */ - continue; /* - try to compare next attributes */ - } - return (false); + + entry = &scankey[i - 1]; + Assert(entry->sk_attno == i); + attrDatum = index_getattr(indexTuple, + entry->sk_attno, + tupDes, + &isNull); + keyDatum = entry->sk_argument; + + /* see comments about NULLs handling in btbuild */ + if (entry->sk_flags & SK_ISNULL) /* key is NULL */ + { + Assert(entry->sk_procedure == NullValueRegProcedure); + keyNull = true; + if (isNull) + compare = (strat == BTEqualStrategyNumber) ? true : false; + else + compare = (strat == BTGreaterStrategyNumber) ? true : false; + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + keyNull = false; + compare = (strat == BTLessStrategyNumber) ? true : false; + } + else + { + keyNull = false; + compare = _bt_invokestrat(rel, i, strat, keyDatum, attrDatum); + } + + if (compare) /* true for one of ">, <, =" */ + { + if (strat != BTEqualStrategyNumber) + return (true); + } + else +/* false for one of ">, <, =" */ + { + if (strat == BTEqualStrategyNumber) + return (false); + + /* + * if original strat was "<=, >=" OR "<, >" but some + * attribute(s) left - need to test for Equality + */ + if (useEqual || i < keysz) + { + if (keyNull || isNull) + compare = (keyNull && isNull) ? true : false; + else + compare = _bt_invokestrat(rel, i, BTEqualStrategyNumber, + keyDatum, attrDatum); + if (compare) /* key' and item' attributes are equal */ + continue; /* - try to compare next attributes */ + } + return (false); + } } - } - - return (true); + + return (true); } /* - * _bt_binsrch() -- Do a binary search for a key on a particular page. + * _bt_binsrch() -- Do a binary search for a key on a particular page. * - * The scankey we get has the compare function stored in the procedure - * entry of each data struct. We invoke this regproc to do the - * comparison for every key in the scankey. _bt_binsrch() returns - * the OffsetNumber of the first matching key on the page, or the - * OffsetNumber at which the matching key would appear if it were - * on this page. + * The scankey we get has the compare function stored in the procedure + * entry of each data struct. We invoke this regproc to do the + * comparison for every key in the scankey. _bt_binsrch() returns + * the OffsetNumber of the first matching key on the page, or the + * OffsetNumber at which the matching key would appear if it were + * on this page. * - * By the time this procedure is called, we're sure we're looking - * at the right page -- don't need to walk right. _bt_binsrch() has - * no lock or refcount side effects on the buffer. + * By the time this procedure is called, we're sure we're looking + * at the right page -- don't need to walk right. _bt_binsrch() has + * no lock or refcount side effects on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, - Buffer buf, - int keysz, - ScanKey scankey, - int srchtype) + Buffer buf, + int keysz, + ScanKey scankey, + int srchtype) { - TupleDesc itupdesc; - Page page; - BTPageOpaque opaque; - OffsetNumber low, mid, high; - int natts = rel->rd_rel->relnatts; - int result; - - itupdesc = RelationGetTupleDescriptor(rel); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* by convention, item 1 on any non-rightmost page is the high key */ - low = mid = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - high = PageGetMaxOffsetNumber(page); - - /* - * Since for non-rightmost pages, the first item on the page is the - * high key, there are two notions of emptiness. One is if nothing - * appears on the page. The other is if nothing but the high key does. - * The reason we test high <= low, rather than high == low, is that - * after vacuuming there may be nothing *but* the high key on a page. - * In that case, given the scheme above, low = 2 and high = 1. - */ - - if ( PageIsEmpty (page) ) - return (low); - if ( (! P_RIGHTMOST(opaque) && high <= low)) - { - if ( high < low || - (srchtype == BT_DESCENT && !(opaque->btpo_flags & BTP_LEAF)) ) - return (low); - /* It's insertion and high == low == 2 */ - result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); - if ( result > 0 ) - return ( OffsetNumberNext (low) ); - return (low); - } - - while ((high - low) > 1) { - mid = low + ((high - low) / 2); - result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); - - if (result > 0) - low = mid; - else if (result < 0) - high = mid - 1; - else + TupleDesc itupdesc; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + mid, + high; + int natts = rel->rd_rel->relnatts; + int result; + + itupdesc = RelationGetTupleDescriptor(rel); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* by convention, item 1 on any non-rightmost page is the high key */ + low = mid = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + high = PageGetMaxOffsetNumber(page); + + /* + * Since for non-rightmost pages, the first item on the page is the + * high key, there are two notions of emptiness. One is if nothing + * appears on the page. The other is if nothing but the high key + * does. The reason we test high <= low, rather than high == low, is + * that after vacuuming there may be nothing *but* the high key on a + * page. In that case, given the scheme above, low = 2 and high = 1. + */ + + if (PageIsEmpty(page)) + return (low); + if ((!P_RIGHTMOST(opaque) && high <= low)) { - mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, mid); - /* - * NOTE for multi-column indices: we may do scan using - * keys not for all attrs. But we handle duplicates using - * all attrs in _bt_insert/_bt_spool code. And so while - * searching on internal pages having number of attrs > keysize - * we want to point at the last item < the scankey, not at the - * first item = the scankey (!!!), and let _bt_moveright - * decide later whether to move right or not (see comments and - * example there). Note also that INSERTions are not affected - * by this code (natts == keysz). - vadim 04/15/97 - */ - if ( natts == keysz || opaque->btpo_flags & BTP_LEAF ) - return (mid); - low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - if ( mid == low ) - return (mid); - return (OffsetNumberPrev(mid)); + if (high < low || + (srchtype == BT_DESCENT && !(opaque->btpo_flags & BTP_LEAF))) + return (low); + /* It's insertion and high == low == 2 */ + result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); + if (result > 0) + return (OffsetNumberNext(low)); + return (low); } - } - - /* - * We terminated because the endpoints got too close together. There - * are two cases to take care of. - * - * For non-insertion searches on internal pages, we want to point at - * the last key <, or first key =, the scankey on the page. This - * guarantees that we'll descend the tree correctly. - * (NOTE comments above for multi-column indices). - * - * For all other cases, we want to point at the first key >= - * the scankey on the page. This guarantees that scans and - * insertions will happen correctly. - */ - - if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) - { /* - * We want the last key <, or first key ==, the scan key. - */ - result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); - - if (result == 0) + + while ((high - low) > 1) { - mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, high); - /* - * If natts > keysz we want last item < the scan key. - * See comments above for multi-column indices. - */ - if ( natts == keysz ) - return (mid); - low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - if ( mid == low ) - return (mid); - return (OffsetNumberPrev(mid)); + mid = low + ((high - low) / 2); + result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); + + if (result > 0) + low = mid; + else if (result < 0) + high = mid - 1; + else + { + mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, mid); + + /* + * NOTE for multi-column indices: we may do scan using keys + * not for all attrs. But we handle duplicates using all attrs + * in _bt_insert/_bt_spool code. And so while searching on + * internal pages having number of attrs > keysize we want to + * point at the last item < the scankey, not at the first item + * = the scankey (!!!), and let _bt_moveright decide later + * whether to move right or not (see comments and example + * there). Note also that INSERTions are not affected by this + * code (natts == keysz). - vadim 04/15/97 + */ + if (natts == keysz || opaque->btpo_flags & BTP_LEAF) + return (mid); + low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + if (mid == low) + return (mid); + return (OffsetNumberPrev(mid)); + } + } + + /* + * We terminated because the endpoints got too close together. There + * are two cases to take care of. + * + * For non-insertion searches on internal pages, we want to point at the + * last key <, or first key =, the scankey on the page. This + * guarantees that we'll descend the tree correctly. (NOTE comments + * above for multi-column indices). + * + * For all other cases, we want to point at the first key >= the scankey + * on the page. This guarantees that scans and insertions will happen + * correctly. + */ + + if (!(opaque->btpo_flags & BTP_LEAF) && srchtype == BT_DESCENT) + { /* We want the last key <, or first key + * ==, the scan key. */ + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + + if (result == 0) + { + mid = _bt_firsteq(rel, itupdesc, page, keysz, scankey, high); + + /* + * If natts > keysz we want last item < the scan key. See + * comments above for multi-column indices. + */ + if (natts == keysz) + return (mid); + low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + if (mid == low) + return (mid); + return (OffsetNumberPrev(mid)); + } + else if (result > 0) + return (high); + else + return (low); } - else if (result > 0) - return (high); - else - return (low); - } - else /* we want the first key >= the scan key */ - { - result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); - if (result <= 0) - return (low); else +/* we want the first key >= the scan key */ { - if (low == high) - return (OffsetNumberNext(low)); - - result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); - if (result <= 0) - return (high); - else - return (OffsetNumberNext(high)); + result = _bt_compare(rel, itupdesc, page, keysz, scankey, low); + if (result <= 0) + return (low); + else + { + if (low == high) + return (OffsetNumberNext(low)); + + result = _bt_compare(rel, itupdesc, page, keysz, scankey, high); + if (result <= 0) + return (high); + else + return (OffsetNumberNext(high)); + } } - } } -static OffsetNumber +static OffsetNumber _bt_firsteq(Relation rel, - TupleDesc itupdesc, - Page page, - Size keysz, - ScanKey scankey, - OffsetNumber offnum) + TupleDesc itupdesc, + Page page, + Size keysz, + ScanKey scankey, + OffsetNumber offnum) { - BTPageOpaque opaque; - OffsetNumber limit; - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* skip the high key, if any */ - limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* walk backwards looking for the first key in the chain of duplicates */ - while (offnum > limit - && _bt_compare(rel, itupdesc, page, - keysz, scankey, OffsetNumberPrev(offnum)) == 0) { - offnum = OffsetNumberPrev(offnum); - } - - return (offnum); + BTPageOpaque opaque; + OffsetNumber limit; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* skip the high key, if any */ + limit = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* walk backwards looking for the first key in the chain of duplicates */ + while (offnum > limit + && _bt_compare(rel, itupdesc, page, + keysz, scankey, OffsetNumberPrev(offnum)) == 0) + { + offnum = OffsetNumberPrev(offnum); + } + + return (offnum); } /* - * _bt_compare() -- Compare scankey to a particular tuple on the page. + * _bt_compare() -- Compare scankey to a particular tuple on the page. * - * This routine returns: - * -1 if scankey < tuple at offnum; - * 0 if scankey == tuple at offnum; - * +1 if scankey > tuple at offnum. + * This routine returns: + * -1 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * +1 if scankey > tuple at offnum. * - * -- Old comments: - * In order to avoid having to propagate changes up the tree any time - * a new minimal key is inserted, the leftmost entry on the leftmost - * page is less than all possible keys, by definition. + * -- Old comments: + * In order to avoid having to propagate changes up the tree any time + * a new minimal key is inserted, the leftmost entry on the leftmost + * page is less than all possible keys, by definition. * - * -- New ones: - * New insertion code (fix against updating _in_place_ if new minimal - * key has bigger size than old one) may delete P_HIKEY entry on the - * root page in order to insert new minimal key - and so this definition - * does not work properly in this case and breaks key' order on root - * page. BTW, this propagation occures only while page' splitting, - * but not "any time a new min key is inserted" (see _bt_insertonpg). - * - vadim 12/05/96 + * -- New ones: + * New insertion code (fix against updating _in_place_ if new minimal + * key has bigger size than old one) may delete P_HIKEY entry on the + * root page in order to insert new minimal key - and so this definition + * does not work properly in this case and breaks key' order on root + * page. BTW, this propagation occures only while page' splitting, + * but not "any time a new min key is inserted" (see _bt_insertonpg). + * - vadim 12/05/96 */ static int _bt_compare(Relation rel, - TupleDesc itupdesc, - Page page, - int keysz, - ScanKey scankey, - OffsetNumber offnum) + TupleDesc itupdesc, + Page page, + int keysz, + ScanKey scankey, + OffsetNumber offnum) { - Datum datum; - BTItem btitem; - ItemId itemid; - IndexTuple itup; - BTPageOpaque opaque; - ScanKey entry; - AttrNumber attno; - int result; - int i; - bool null; - - /* - * If this is a leftmost internal page, and if our comparison is - * with the first key on the page, then the item at that position is - * by definition less than the scan key. - * - * - see new comments above... - */ - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (!(opaque->btpo_flags & BTP_LEAF) - && P_LEFTMOST(opaque) - && offnum == P_HIKEY) { - itemid = PageGetItemId(page, offnum); - + Datum datum; + BTItem btitem; + ItemId itemid; + IndexTuple itup; + BTPageOpaque opaque; + ScanKey entry; + AttrNumber attno; + int result; + int i; + bool null; + /* - * we just have to believe that this will only be called with - * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the - * first actual data key (i.e., this is also a rightmost - * page). there doesn't seem to be any code that implies - * that the leftmost page is normally missing a high key as - * well as the rightmost page. but that implies that this - * code path only applies to the root -- which seems - * unlikely.. + * If this is a leftmost internal page, and if our comparison is with + * the first key on the page, then the item at that position is by + * definition less than the scan key. * - * - see new comments above... + * - see new comments above... */ - if (! P_RIGHTMOST(opaque)) { - elog(WARN, "_bt_compare: invalid comparison to high key"); - } + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!(opaque->btpo_flags & BTP_LEAF) + && P_LEFTMOST(opaque) + && offnum == P_HIKEY) + { + itemid = PageGetItemId(page, offnum); + + /* + * we just have to believe that this will only be called with + * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first + * actual data key (i.e., this is also a rightmost page). there + * doesn't seem to be any code that implies that the leftmost page + * is normally missing a high key as well as the rightmost page. + * but that implies that this code path only applies to the root + * -- which seems unlikely.. + * + * - see new comments above... + */ + if (!P_RIGHTMOST(opaque)) + { + elog(WARN, "_bt_compare: invalid comparison to high key"); + } #if 0 + + /* + * We just have to belive that right answer will not break + * anything. I've checked code and all seems to be ok. See new + * comments above... + * + * -- Old comments If the item on the page is equal to the scankey, + * that's okay to admit. We just can't claim that the first key + * on the page is greater than anything. + */ + + if (_bt_skeycmp(rel, keysz, scankey, page, itemid, + BTEqualStrategyNumber)) + { + return (0); + } + return (1); +#endif + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + /* - * We just have to belive that right answer will not - * break anything. I've checked code and all seems to be ok. - * See new comments above... + * The scan key is set up with the attribute number associated with + * each term in the key. It is important that, if the index is + * multi-key, the scan contain the first k key attributes, and that + * they be in order. If you think about how multi-key ordering works, + * you'll understand why this is. * - * -- Old comments - * If the item on the page is equal to the scankey, that's - * okay to admit. We just can't claim that the first key on - * the page is greater than anything. + * We don't test for violation of this condition here. */ - - if (_bt_skeycmp(rel, keysz, scankey, page, itemid, - BTEqualStrategyNumber)) { - return (0); - } - return (1); -#endif - } - - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &(btitem->bti_itup); - - /* - * The scan key is set up with the attribute number associated with each - * term in the key. It is important that, if the index is multi-key, - * the scan contain the first k key attributes, and that they be in - * order. If you think about how multi-key ordering works, you'll - * understand why this is. - * - * We don't test for violation of this condition here. - */ - - for (i = 1; i <= keysz; i++) { - long tmpres; - - entry = &scankey[i - 1]; - attno = entry->sk_attno; - datum = index_getattr(itup, attno, itupdesc, &null); - - /* see comments about NULLs handling in btbuild */ - if ( entry->sk_flags & SK_ISNULL ) /* key is NULL */ + + for (i = 1; i <= keysz; i++) { - Assert ( entry->sk_procedure == NullValueRegProcedure ); - if ( null ) - tmpres = (long) 0; /* NULL "=" NULL */ - else - tmpres = (long) 1; /* NULL ">" NOT_NULL */ - } - else if ( null ) /* key is NOT_NULL and item is NULL */ - { - tmpres = (long) -1; /* NOT_NULL "<" NULL */ - } - else - { - tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, - entry->sk_argument, datum); + long tmpres; + + entry = &scankey[i - 1]; + attno = entry->sk_attno; + datum = index_getattr(itup, attno, itupdesc, &null); + + /* see comments about NULLs handling in btbuild */ + if (entry->sk_flags & SK_ISNULL) /* key is NULL */ + { + Assert(entry->sk_procedure == NullValueRegProcedure); + if (null) + tmpres = (long) 0; /* NULL "=" NULL */ + else + tmpres = (long) 1; /* NULL ">" NOT_NULL */ + } + else if (null) /* key is NOT_NULL and item is NULL */ + { + tmpres = (long) -1; /* NOT_NULL "<" NULL */ + } + else + { + tmpres = (long) FMGR_PTR2(entry->sk_func, entry->sk_procedure, + entry->sk_argument, datum); + } + result = tmpres; + + /* if the keys are unequal, return the difference */ + if (result != 0) + return (result); } - result = tmpres; - - /* if the keys are unequal, return the difference */ - if (result != 0) - return (result); - } - - /* by here, the keys are equal */ - return (0); + + /* by here, the keys are equal */ + return (0); } /* - * _bt_next() -- Get the next item in a scan. + * _bt_next() -- Get the next item in a scan. * - * On entry, we have a valid currentItemData in the scan, and a - * read lock on the page that contains that item. We do not have - * the page pinned. We return the next item in the scan. On - * exit, we have the page containing the next item locked but not - * pinned. + * On entry, we have a valid currentItemData in the scan, and a + * read lock on the page that contains that item. We do not have + * the page pinned. We return the next item in the scan. On + * exit, we have the page containing the next item locked but not + * pinned. */ RetrieveIndexResult _bt_next(IndexScanDesc scan, ScanDirection dir) { - Relation rel; - Buffer buf; - Page page; - OffsetNumber offnum; - RetrieveIndexResult res; - ItemPointer current; - BTItem btitem; - IndexTuple itup; - BTScanOpaque so; - Size keysok; - - rel = scan->relation; - so = (BTScanOpaque) scan->opaque; - current = &(scan->currentItemData); - - /* - * XXX 10 may 91: somewhere there's a bug in our management of the - * cached buffer for this scan. wei discovered it. the following - * is a workaround so he can work until i figure out what's going on. - */ - - if (!BufferIsValid(so->btso_curbuf)) - so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current), - BT_READ); - - /* we still have the buffer pinned and locked */ - buf = so->btso_curbuf; - - do - { - /* step one tuple in the appropriate direction */ - if (!_bt_step(scan, &buf, dir)) - return ((RetrieveIndexResult) NULL); - - /* by here, current is the tuple we want to return */ - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &btitem->bti_itup; - - if ( _bt_checkkeys (scan, itup, &keysok) ) - { - Assert (keysok == so->numberOfKeys); - res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned and locked */ - so->btso_curbuf = buf; - return (res); - } + Relation rel; + Buffer buf; + Page page; + OffsetNumber offnum; + RetrieveIndexResult res; + ItemPointer current; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + Size keysok; + + rel = scan->relation; + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); + + /* + * XXX 10 may 91: somewhere there's a bug in our management of the + * cached buffer for this scan. wei discovered it. the following is + * a workaround so he can work until i figure out what's going on. + */ + + if (!BufferIsValid(so->btso_curbuf)) + so->btso_curbuf = _bt_getbuf(rel, ItemPointerGetBlockNumber(current), + BT_READ); + + /* we still have the buffer pinned and locked */ + buf = so->btso_curbuf; + + do + { + /* step one tuple in the appropriate direction */ + if (!_bt_step(scan, &buf, dir)) + return ((RetrieveIndexResult) NULL); - } while ( keysok >= so->numberOfFirstKeys ); + /* by here, current is the tuple we want to return */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkkeys(scan, itup, &keysok)) + { + Assert(keysok == so->numberOfKeys); + res = FormRetrieveIndexResult(current, &(itup->t_tid)); + + /* remember which buffer we have pinned and locked */ + so->btso_curbuf = buf; + return (res); + } + + } while (keysok >= so->numberOfFirstKeys); - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - - return ((RetrieveIndexResult) NULL); + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + + return ((RetrieveIndexResult) NULL); } /* - * _bt_first() -- Find the first item in a scan. + * _bt_first() -- Find the first item in a scan. * - * We need to be clever about the type of scan, the operation it's - * performing, and the tree ordering. We return the RetrieveIndexResult - * of the first item in the tree that satisfies the qualification - * associated with the scan descriptor. On exit, the page containing - * the current index tuple is read locked and pinned, and the scan's - * opaque data entry is updated to include the buffer. + * We need to be clever about the type of scan, the operation it's + * performing, and the tree ordering. We return the RetrieveIndexResult + * of the first item in the tree that satisfies the qualification + * associated with the scan descriptor. On exit, the page containing + * the current index tuple is read locked and pinned, and the scan's + * opaque data entry is updated to include the buffer. */ RetrieveIndexResult _bt_first(IndexScanDesc scan, ScanDirection dir) { - Relation rel; - TupleDesc itupdesc; - Buffer buf; - Page page; - BTPageOpaque pop; - BTStack stack; - OffsetNumber offnum, maxoff; - bool offGmax = false; - BTItem btitem; - IndexTuple itup; - ItemPointer current; - BlockNumber blkno; - StrategyNumber strat; - RetrieveIndexResult res; - RegProcedure proc; - int result; - BTScanOpaque so; - ScanKeyData skdata; - Size keysok; - - rel = scan->relation; - so = (BTScanOpaque) scan->opaque; - - /* - * Order the keys in the qualification and be sure - * that the scan exploits the tree order. - */ - so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */ - so->qual_ok = 1; /* may be changed by _bt_orderkeys */ - scan->scanFromEnd = false; - if ( so->numberOfKeys > 0 ) - { - _bt_orderkeys(rel, so); - - strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure); + Relation rel; + TupleDesc itupdesc; + Buffer buf; + Page page; + BTPageOpaque pop; + BTStack stack; + OffsetNumber offnum, + maxoff; + bool offGmax = false; + BTItem btitem; + IndexTuple itup; + ItemPointer current; + BlockNumber blkno; + StrategyNumber strat; + RetrieveIndexResult res; + RegProcedure proc; + int result; + BTScanOpaque so; + ScanKeyData skdata; + Size keysok; - /* NOTE: it assumes ForwardScanDirection */ - if ( strat == BTLessStrategyNumber || - strat == BTLessEqualStrategyNumber ) - scan->scanFromEnd = true; - } - else - scan->scanFromEnd = true; - - if ( so->qual_ok == 0 ) - return ((RetrieveIndexResult) NULL); - - /* if we just need to walk down one edge of the tree, do that */ - if (scan->scanFromEnd) - return (_bt_endpoint(scan, dir)); - - itupdesc = RelationGetTupleDescriptor(rel); - current = &(scan->currentItemData); - - /* - * Okay, we want something more complicated. What we'll do is use - * the first item in the scan key passed in (which has been correctly - * ordered to take advantage of index ordering) to position ourselves - * at the right place in the scan. - */ - /* _bt_orderkeys disallows it, but it's place to add some code latter */ - if ( so->keyData[0].sk_flags & SK_ISNULL ) - { - elog (WARN, "_bt_first: btree doesn't support is(not)null, yet"); - return ((RetrieveIndexResult) NULL); - } - proc = index_getprocid(rel, 1, BTORDER_PROC); - ScanKeyEntryInitialize(&skdata, so->keyData[0].sk_flags, 1, proc, - so->keyData[0].sk_argument); - - stack = _bt_search(rel, 1, &skdata, &buf); - _bt_freestack(stack); - - blkno = BufferGetBlockNumber(buf); - page = BufferGetPage(buf); - - /* - * This will happen if the tree we're searching is entirely empty, - * or if we're doing a search for a key that would appear on an - * entirely empty internal page. In either case, there are no - * matching tuples in the index. - */ - - if (PageIsEmpty(page)) { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - return ((RetrieveIndexResult) NULL); - } - maxoff = PageGetMaxOffsetNumber(page); - pop = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * Now _bt_moveright doesn't move from non-rightmost leaf page - * if scankey == hikey and there is only hikey there. It's - * good for insertion, but we need to do work for scan here. - * - vadim 05/27/97 - */ - - while ( maxoff == P_HIKEY && !P_RIGHTMOST(pop) && - _bt_skeycmp(rel, 1, &skdata, page, - PageGetItemId(page, P_HIKEY), - BTGreaterEqualStrategyNumber) ) - { - /* step right one page */ - blkno = pop->btpo_next; - _bt_relbuf(rel, buf, BT_READ); - buf = _bt_getbuf(rel, blkno, BT_READ); + rel = scan->relation; + so = (BTScanOpaque) scan->opaque; + + /* + * Order the keys in the qualification and be sure that the scan + * exploits the tree order. + */ + so->numberOfFirstKeys = 0; /* may be changed by _bt_orderkeys */ + so->qual_ok = 1; /* may be changed by _bt_orderkeys */ + scan->scanFromEnd = false; + if (so->numberOfKeys > 0) + { + _bt_orderkeys(rel, so); + + strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure); + + /* NOTE: it assumes ForwardScanDirection */ + if (strat == BTLessStrategyNumber || + strat == BTLessEqualStrategyNumber) + scan->scanFromEnd = true; + } + else + scan->scanFromEnd = true; + + if (so->qual_ok == 0) + return ((RetrieveIndexResult) NULL); + + /* if we just need to walk down one edge of the tree, do that */ + if (scan->scanFromEnd) + return (_bt_endpoint(scan, dir)); + + itupdesc = RelationGetTupleDescriptor(rel); + current = &(scan->currentItemData); + + /* + * Okay, we want something more complicated. What we'll do is use the + * first item in the scan key passed in (which has been correctly + * ordered to take advantage of index ordering) to position ourselves + * at the right place in the scan. + */ + /* _bt_orderkeys disallows it, but it's place to add some code latter */ + if (so->keyData[0].sk_flags & SK_ISNULL) + { + elog(WARN, "_bt_first: btree doesn't support is(not)null, yet"); + return ((RetrieveIndexResult) NULL); + } + proc = index_getprocid(rel, 1, BTORDER_PROC); + ScanKeyEntryInitialize(&skdata, so->keyData[0].sk_flags, 1, proc, + so->keyData[0].sk_argument); + + stack = _bt_search(rel, 1, &skdata, &buf); + _bt_freestack(stack); + + blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); - if (PageIsEmpty(page)) { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - return ((RetrieveIndexResult) NULL); + + /* + * This will happen if the tree we're searching is entirely empty, or + * if we're doing a search for a key that would appear on an entirely + * empty internal page. In either case, there are no matching tuples + * in the index. + */ + + if (PageIsEmpty(page)) + { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + return ((RetrieveIndexResult) NULL); } - maxoff = PageGetMaxOffsetNumber(page); + maxoff = PageGetMaxOffsetNumber(page); pop = (BTPageOpaque) PageGetSpecialPointer(page); - } - - - /* find the nearest match to the manufactured scan key on the page */ - offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT); - - if (offnum > maxoff) - { - offnum = maxoff; - offGmax = true; - } - - ItemPointerSet(current, blkno, offnum); - - /* - * Now find the right place to start the scan. Result is the - * value we're looking for minus the value we're looking at - * in the index. - */ - - result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); - - /* it's yet other place to add some code latter for is(not)null */ - - strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure); - - switch (strat) { - case BTLessStrategyNumber: - if (result <= 0) { - do { - if (!_bt_twostep(scan, &buf, BackwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); - } while (result <= 0); - - /* if this is true, the key we just looked at is gone */ - if (result > 0) - _bt_twostep(scan, &buf, ForwardScanDirection); - } - break; - - case BTLessEqualStrategyNumber: - if (result >= 0) { - do { - if (!_bt_twostep(scan, &buf, ForwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); + + /* + * Now _bt_moveright doesn't move from non-rightmost leaf page if + * scankey == hikey and there is only hikey there. It's good for + * insertion, but we need to do work for scan here. - vadim 05/27/97 + */ + + while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) && + _bt_skeycmp(rel, 1, &skdata, page, + PageGetItemId(page, P_HIKEY), + BTGreaterEqualStrategyNumber)) + { + /* step right one page */ + blkno = pop->btpo_next; + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); - } while (result >= 0); - - if (result < 0) - _bt_twostep(scan, &buf, BackwardScanDirection); + if (PageIsEmpty(page)) + { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + return ((RetrieveIndexResult) NULL); + } + maxoff = PageGetMaxOffsetNumber(page); + pop = (BTPageOpaque) PageGetSpecialPointer(page); } - break; - - case BTEqualStrategyNumber: - if (result != 0) { - _bt_relbuf(scan->relation, buf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(&(scan->currentItemData)); - return ((RetrieveIndexResult) NULL); + + + /* find the nearest match to the manufactured scan key on the page */ + offnum = _bt_binsrch(rel, buf, 1, &skdata, BT_DESCENT); + + if (offnum > maxoff) + { + offnum = maxoff; + offGmax = true; } - break; - - case BTGreaterEqualStrategyNumber: - if ( offGmax ) + + ItemPointerSet(current, blkno, offnum); + + /* + * Now find the right place to start the scan. Result is the value + * we're looking for minus the value we're looking at in the index. + */ + + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + + /* it's yet other place to add some code latter for is(not)null */ + + strat = _bt_getstrat(rel, 1, so->keyData[0].sk_procedure); + + switch (strat) { - if (result < 0) - { - Assert ( !P_RIGHTMOST(pop) && maxoff == P_HIKEY ); - if ( !_bt_step(scan, &buf, ForwardScanDirection) ) - { - _bt_relbuf(scan->relation, buf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(&(scan->currentItemData)); - return ((RetrieveIndexResult) NULL); + case BTLessStrategyNumber: + if (result <= 0) + { + do + { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result <= 0); + + /* if this is true, the key we just looked at is gone */ + if (result > 0) + _bt_twostep(scan, &buf, ForwardScanDirection); } - } - else if (result > 0) - { /* - * Just remember: _bt_binsrch() returns the OffsetNumber of - * the first matching key on the page, or the OffsetNumber at - * which the matching key WOULD APPEAR IF IT WERE on this page. - * No key on this page, but offnum from _bt_binsrch() greater - * maxoff - have to move right. - vadim 12/06/96 - */ - _bt_twostep(scan, &buf, ForwardScanDirection); - } + break; + + case BTLessEqualStrategyNumber: + if (result >= 0) + { + do + { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + + if (result < 0) + _bt_twostep(scan, &buf, BackwardScanDirection); + } + break; + + case BTEqualStrategyNumber: + if (result != 0) + { + _bt_relbuf(scan->relation, buf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(&(scan->currentItemData)); + return ((RetrieveIndexResult) NULL); + } + break; + + case BTGreaterEqualStrategyNumber: + if (offGmax) + { + if (result < 0) + { + Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + { + _bt_relbuf(scan->relation, buf, BT_READ); + so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(&(scan->currentItemData)); + return ((RetrieveIndexResult) NULL); + } + } + else if (result > 0) + { /* Just remember: _bt_binsrch() returns + * the OffsetNumber of the first matching + * key on the page, or the OffsetNumber at + * which the matching key WOULD APPEAR IF + * IT WERE on this page. No key on this + * page, but offnum from _bt_binsrch() + * greater maxoff - have to move right. - + * vadim 12/06/96 */ + _bt_twostep(scan, &buf, ForwardScanDirection); + } + } + else if (result < 0) + { + do + { + if (!_bt_twostep(scan, &buf, BackwardScanDirection)) + break; + + page = BufferGetPage(buf); + offnum = ItemPointerGetOffsetNumber(current); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result < 0); + + if (result > 0) + _bt_twostep(scan, &buf, ForwardScanDirection); + } + break; + + case BTGreaterStrategyNumber: + /* offGmax helps as above */ + if (result >= 0 || offGmax) + { + do + { + if (!_bt_twostep(scan, &buf, ForwardScanDirection)) + break; + + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); + } while (result >= 0); + } + break; } - else if (result < 0) + + /* okay, current item pointer for the scan is right */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + + if (_bt_checkkeys(scan, itup, &keysok)) { - do { - if (!_bt_twostep(scan, &buf, BackwardScanDirection)) - break; - - page = BufferGetPage(buf); - offnum = ItemPointerGetOffsetNumber(current); - result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); - } while (result < 0); - - if (result > 0) - _bt_twostep(scan, &buf, ForwardScanDirection); + res = FormRetrieveIndexResult(current, &(itup->t_tid)); + + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; } - break; - - case BTGreaterStrategyNumber: - /* offGmax helps as above */ - if (result >= 0 || offGmax) { - do { - if (!_bt_twostep(scan, &buf, ForwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, 1, &skdata, offnum); - } while (result >= 0); + else if (keysok >= so->numberOfFirstKeys) + { + so->btso_curbuf = buf; + return (_bt_next(scan, dir)); } - break; - } - - /* okay, current item pointer for the scan is right */ - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &btitem->bti_itup; - - if ( _bt_checkkeys (scan, itup, &keysok) ) - { - res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned */ - so->btso_curbuf = buf; - } - else if ( keysok >= so->numberOfFirstKeys ) - { - so->btso_curbuf = buf; - return (_bt_next (scan, dir)); - } - else - { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - res = (RetrieveIndexResult) NULL; - } - - return (res); + else + { + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; + } + + return (res); } /* - * _bt_step() -- Step one item in the requested direction in a scan on - * the tree. + * _bt_step() -- Step one item in the requested direction in a scan on + * the tree. * - * If no adjacent record exists in the requested direction, return - * false. Else, return true and set the currentItemData for the - * scan to the right thing. + * If no adjacent record exists in the requested direction, return + * false. Else, return true and set the currentItemData for the + * scan to the right thing. */ bool -_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +_bt_step(IndexScanDesc scan, Buffer * bufP, ScanDirection dir) { - Page page; - BTPageOpaque opaque; - OffsetNumber offnum, maxoff; - OffsetNumber start; - BlockNumber blkno; - BlockNumber obknum; - BTScanOpaque so; - ItemPointer current; - Relation rel; - - rel = scan->relation; - current = &(scan->currentItemData); - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - so = (BTScanOpaque) scan->opaque; - maxoff = PageGetMaxOffsetNumber(page); - - /* get the next tuple */ - if (ScanDirectionIsForward(dir)) { - if (!PageIsEmpty(page) && offnum < maxoff) { - offnum = OffsetNumberNext(offnum); - } else { - - /* if we're at end of scan, release the buffer and return */ - blkno = opaque->btpo_next; - if (P_RIGHTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); - ItemPointerSetInvalid(current); - *bufP = so->btso_curbuf = InvalidBuffer; - return (false); - } else { - - /* walk right to the next page with data */ - _bt_relbuf(rel, *bufP, BT_READ); - for (;;) { - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - if (!PageIsEmpty(page) && start <= maxoff) { - break; - } else { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum, + maxoff; + OffsetNumber start; + BlockNumber blkno; + BlockNumber obknum; + BTScanOpaque so; + ItemPointer current; + Relation rel; + + rel = scan->relation; + current = &(scan->currentItemData); + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + so = (BTScanOpaque) scan->opaque; + maxoff = PageGetMaxOffsetNumber(page); + + /* get the next tuple */ + if (ScanDirectionIsForward(dir)) + { + if (!PageIsEmpty(page) && offnum < maxoff) + { + offnum = OffsetNumberNext(offnum); + } + else + { + + /* if we're at end of scan, release the buffer and return */ blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); - if (blkno == P_NONE) { - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return (false); + if (P_RIGHTMOST(opaque)) + { + _bt_relbuf(rel, *bufP, BT_READ); + ItemPointerSetInvalid(current); + *bufP = so->btso_curbuf = InvalidBuffer; + return (false); + } + else + { + + /* walk right to the next page with data */ + _bt_relbuf(rel, *bufP, BT_READ); + for (;;) + { + *bufP = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (!PageIsEmpty(page) && start <= maxoff) + { + break; + } + else + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, *bufP, BT_READ); + if (blkno == P_NONE) + { + *bufP = so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(current); + return (false); + } + } + } + offnum = start; } - } } - offnum = start; - } } - } else if (ScanDirectionIsBackward(dir)) { - - /* remember that high key is item zero on non-rightmost pages */ - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + else if (ScanDirectionIsBackward(dir)) + { - if (offnum > start) { - offnum = OffsetNumberPrev(offnum); - } else { - - /* if we're at end of scan, release the buffer and return */ - blkno = opaque->btpo_prev; - if (P_LEFTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return (false); - } else { - - obknum = BufferGetBlockNumber(*bufP); - - /* walk right to the next page with data */ - _bt_relbuf(rel, *bufP, BT_READ); - for (;;) { - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - - /* - * If the adjacent page just split, then we may have the - * wrong block. Handle this case. Because pages only - * split right, we don't have to worry about this failing - * to terminate. - */ - - while (opaque->btpo_next != obknum) { - blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - } - - /* don't consider the high key */ - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* anything to look at here? */ - if (!PageIsEmpty(page) && maxoff >= start) { - break; - } else { + /* remember that high key is item zero on non-rightmost pages */ + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + if (offnum > start) + { + offnum = OffsetNumberPrev(offnum); + } + else + { + + /* if we're at end of scan, release the buffer and return */ blkno = opaque->btpo_prev; - obknum = BufferGetBlockNumber(*bufP); - _bt_relbuf(rel, *bufP, BT_READ); - if (blkno == P_NONE) { - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return (false); + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(current); + return (false); + } + else + { + + obknum = BufferGetBlockNumber(*bufP); + + /* walk right to the next page with data */ + _bt_relbuf(rel, *bufP, BT_READ); + for (;;) + { + *bufP = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * If the adjacent page just split, then we may have + * the wrong block. Handle this case. Because pages + * only split right, we don't have to worry about this + * failing to terminate. + */ + + while (opaque->btpo_next != obknum) + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + } + + /* don't consider the high key */ + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* anything to look at here? */ + if (!PageIsEmpty(page) && maxoff >= start) + { + break; + } + else + { + blkno = opaque->btpo_prev; + obknum = BufferGetBlockNumber(*bufP); + _bt_relbuf(rel, *bufP, BT_READ); + if (blkno == P_NONE) + { + *bufP = so->btso_curbuf = InvalidBuffer; + ItemPointerSetInvalid(current); + return (false); + } + } + } + offnum = maxoff;/* XXX PageIsEmpty? */ } - } } - offnum = maxoff; /* XXX PageIsEmpty? */ - } } - } - blkno = BufferGetBlockNumber(*bufP); - so->btso_curbuf = *bufP; - ItemPointerSet(current, blkno, offnum); - - return (true); + blkno = BufferGetBlockNumber(*bufP); + so->btso_curbuf = *bufP; + ItemPointerSet(current, blkno, offnum); + + return (true); } /* - * _bt_twostep() -- Move to an adjacent record in a scan on the tree, - * if an adjacent record exists. + * _bt_twostep() -- Move to an adjacent record in a scan on the tree, + * if an adjacent record exists. * - * This is like _bt_step, except that if no adjacent record exists - * it restores us to where we were before trying the step. This is - * only hairy when you cross page boundaries, since the page you cross - * from could have records inserted or deleted, or could even split. - * This is unlikely, but we try to handle it correctly here anyway. + * This is like _bt_step, except that if no adjacent record exists + * it restores us to where we were before trying the step. This is + * only hairy when you cross page boundaries, since the page you cross + * from could have records inserted or deleted, or could even split. + * This is unlikely, but we try to handle it correctly here anyway. * - * This routine contains the only case in which our changes to Lehman - * and Yao's algorithm. + * This routine contains the only case in which our changes to Lehman + * and Yao's algorithm. * - * Like step, this routine leaves the scan's currentItemData in the - * proper state and acquires a lock and pin on *bufP. If the twostep - * succeeded, we return true; otherwise, we return false. + * Like step, this routine leaves the scan's currentItemData in the + * proper state and acquires a lock and pin on *bufP. If the twostep + * succeeded, we return true; otherwise, we return false. */ -static bool -_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) +static bool +_bt_twostep(IndexScanDesc scan, Buffer * bufP, ScanDirection dir) { - Page page; - BTPageOpaque opaque; - OffsetNumber offnum, maxoff; - OffsetNumber start; - ItemPointer current; - ItemId itemid; - int itemsz; - BTItem btitem; - BTItem svitem; - BlockNumber blkno; - - blkno = BufferGetBlockNumber(*bufP); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - current = &(scan->currentItemData); - offnum = ItemPointerGetOffsetNumber(current); - - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* if we're safe, just do it */ - if (ScanDirectionIsForward(dir) && offnum < maxoff) { /* XXX PageIsEmpty? */ - ItemPointerSet(current, blkno, OffsetNumberNext(offnum)); - return (true); - } else if (ScanDirectionIsBackward(dir) && offnum > start) { - ItemPointerSet(current, blkno, OffsetNumberPrev(offnum)); - return (true); - } - - /* if we've hit end of scan we don't have to do any work */ - if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) { - return (false); - } else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) { - return (false); - } - - /* - * Okay, it's off the page; let _bt_step() do the hard work, and we'll - * try to remember where we were. This is not guaranteed to work; this - * is the only place in the code where concurrency can screw us up, - * and it's because we want to be able to move in two directions in - * the scan. - */ - - itemid = PageGetItemId(page, offnum); - itemsz = ItemIdGetLength(itemid); - btitem = (BTItem) PageGetItem(page, itemid); - svitem = (BTItem) palloc(itemsz); - memmove((char *) svitem, (char *) btitem, itemsz); - - if (_bt_step(scan, bufP, dir)) { - pfree(svitem); - return (true); - } - - /* try to find our place again */ - *bufP = _bt_getbuf(scan->relation, blkno, BT_READ); - page = BufferGetPage(*bufP); - maxoff = PageGetMaxOffsetNumber(page); - - while (offnum <= maxoff) { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum, + maxoff; + OffsetNumber start; + ItemPointer current; + ItemId itemid; + int itemsz; + BTItem btitem; + BTItem svitem; + BlockNumber blkno; + + blkno = BufferGetBlockNumber(*bufP); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + current = &(scan->currentItemData); + offnum = ItemPointerGetOffsetNumber(current); + + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* if we're safe, just do it */ + if (ScanDirectionIsForward(dir) && offnum < maxoff) + { /* XXX PageIsEmpty? */ + ItemPointerSet(current, blkno, OffsetNumberNext(offnum)); + return (true); + } + else if (ScanDirectionIsBackward(dir) && offnum > start) + { + ItemPointerSet(current, blkno, OffsetNumberPrev(offnum)); + return (true); + } + + /* if we've hit end of scan we don't have to do any work */ + if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) + { + return (false); + } + else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) + { + return (false); + } + + /* + * Okay, it's off the page; let _bt_step() do the hard work, and we'll + * try to remember where we were. This is not guaranteed to work; + * this is the only place in the code where concurrency can screw us + * up, and it's because we want to be able to move in two directions + * in the scan. + */ + itemid = PageGetItemId(page, offnum); + itemsz = ItemIdGetLength(itemid); btitem = (BTItem) PageGetItem(page, itemid); - if ( BTItemSame (btitem, svitem) ) { - pfree(svitem); - ItemPointerSet(current, blkno, offnum); - return (false); + svitem = (BTItem) palloc(itemsz); + memmove((char *) svitem, (char *) btitem, itemsz); + + if (_bt_step(scan, bufP, dir)) + { + pfree(svitem); + return (true); + } + + /* try to find our place again */ + *bufP = _bt_getbuf(scan->relation, blkno, BT_READ); + page = BufferGetPage(*bufP); + maxoff = PageGetMaxOffsetNumber(page); + + while (offnum <= maxoff) + { + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + if (BTItemSame(btitem, svitem)) + { + pfree(svitem); + ItemPointerSet(current, blkno, offnum); + return (false); + } } - } - - /* - * XXX crash and burn -- can't find our place. We can be a little - * smarter -- walk to the next page to the right, for example, since - * that's the only direction that splits happen in. Deletions screw - * us up less often since they're only done by the vacuum daemon. - */ - - elog(WARN, "btree synchronization error: concurrent update botched scan"); - - return (false); + + /* + * XXX crash and burn -- can't find our place. We can be a little + * smarter -- walk to the next page to the right, for example, since + * that's the only direction that splits happen in. Deletions screw + * us up less often since they're only done by the vacuum daemon. + */ + + elog(WARN, "btree synchronization error: concurrent update botched scan"); + + return (false); } /* - * _bt_endpoint() -- Find the first or last key in the index. + * _bt_endpoint() -- Find the first or last key in the index. */ -static RetrieveIndexResult +static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir) { - Relation rel; - Buffer buf; - Page page; - BTPageOpaque opaque; - ItemPointer current; - OffsetNumber offnum, maxoff; - OffsetNumber start = 0; - BlockNumber blkno; - BTItem btitem; - IndexTuple itup; - BTScanOpaque so; - RetrieveIndexResult res; - Size keysok; - - rel = scan->relation; - current = &(scan->currentItemData); - so = (BTScanOpaque) scan->opaque; - - buf = _bt_getroot(rel, BT_READ); - blkno = BufferGetBlockNumber(buf); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - for (;;) { - if (opaque->btpo_flags & BTP_LEAF) - break; - - if (ScanDirectionIsForward(dir)) { - offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - } else { - offnum = PageGetMaxOffsetNumber(page); - } - - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &(btitem->bti_itup); - - blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - - _bt_relbuf(rel, buf, BT_READ); - buf = _bt_getbuf(rel, blkno, BT_READ); + Relation rel; + Buffer buf; + Page page; + BTPageOpaque opaque; + ItemPointer current; + OffsetNumber offnum, + maxoff; + OffsetNumber start = 0; + BlockNumber blkno; + BTItem btitem; + IndexTuple itup; + BTScanOpaque so; + RetrieveIndexResult res; + Size keysok; + + rel = scan->relation; + current = &(scan->currentItemData); + so = (BTScanOpaque) scan->opaque; + + buf = _bt_getroot(rel, BT_READ); + blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * Race condition: If the child page we just stepped onto is - * in the process of being split, we need to make sure we're - * all the way at the right edge of the tree. See the paper - * by Lehman and Yao. - */ - - if (ScanDirectionIsBackward(dir) && ! P_RIGHTMOST(opaque)) { - do { - blkno = opaque->btpo_next; + + for (;;) + { + if (opaque->btpo_flags & BTP_LEAF) + break; + + if (ScanDirectionIsForward(dir)) + { + offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + } + else + { + offnum = PageGetMaxOffsetNumber(page); + } + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &(btitem->bti_itup); + + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + _bt_relbuf(rel, buf, BT_READ); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - } while (! P_RIGHTMOST(opaque)); + + /* + * Race condition: If the child page we just stepped onto is in + * the process of being split, we need to make sure we're all the + * way at the right edge of the tree. See the paper by Lehman and + * Yao. + */ + + if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque)) + { + do + { + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf, BT_READ); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } while (!P_RIGHTMOST(opaque)); + } } - } - - /* okay, we've got the {left,right}-most page in the tree */ - maxoff = PageGetMaxOffsetNumber(page); - - if (ScanDirectionIsForward(dir)) { - if ( !P_LEFTMOST(opaque) ) /* non-leftmost page ? */ - elog (WARN, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno); - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - /* - * I don't understand this stuff! It doesn't work for non-rightmost - * pages with only one element (P_HIKEY) which we have after - * deletion itups by vacuum (it's case of start > maxoff). - * Scanning in BackwardScanDirection is not understandable at all. - * Well - new stuff. - vadim 12/06/96 - */ + + /* okay, we've got the {left,right}-most page in the tree */ + maxoff = PageGetMaxOffsetNumber(page); + + if (ScanDirectionIsForward(dir)) + { + if (!P_LEFTMOST(opaque))/* non-leftmost page ? */ + elog(WARN, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno); + start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + + /* + * I don't understand this stuff! It doesn't work for + * non-rightmost pages with only one element (P_HIKEY) which we + * have after deletion itups by vacuum (it's case of start > + * maxoff). Scanning in BackwardScanDirection is not + * understandable at all. Well - new stuff. - vadim 12/06/96 + */ #if 0 - if (PageIsEmpty(page) || start > maxoff) { - ItemPointerSet(current, blkno, maxoff); - if (!_bt_step(scan, &buf, BackwardScanDirection)) - return ((RetrieveIndexResult) NULL); - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } + if (PageIsEmpty(page) || start > maxoff) + { + ItemPointerSet(current, blkno, maxoff); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } #endif - if ( PageIsEmpty (page) ) + if (PageIsEmpty(page)) + { + if (start != P_HIKEY) /* non-rightmost page */ + elog(WARN, "_bt_endpoint: non-rightmost page (%u) is empty", blkno); + + /* + * It's left- & right- most page - root page, - and it's + * empty... + */ + _bt_relbuf(rel, buf, BT_READ); + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + return ((RetrieveIndexResult) NULL); + } + if (start > maxoff) /* start == 2 && maxoff == 1 */ + { + ItemPointerSet(current, blkno, maxoff); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } + /* new stuff ends here */ + else + { + ItemPointerSet(current, blkno, start); + } + } + else if (ScanDirectionIsBackward(dir)) { - if ( start != P_HIKEY ) /* non-rightmost page */ - elog (WARN, "_bt_endpoint: non-rightmost page (%u) is empty", blkno); - /* It's left- & right- most page - root page, - and it's empty... */ - _bt_relbuf(rel, buf, BT_READ); - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - return ((RetrieveIndexResult) NULL); + + /* + * I don't understand this stuff too! If RIGHT-most leaf page is + * empty why do scanning in ForwardScanDirection ??? Well - new + * stuff. - vadim 12/06/96 + */ +#if 0 + if (PageIsEmpty(page)) + { + ItemPointerSet(current, blkno, FirstOffsetNumber); + if (!_bt_step(scan, &buf, ForwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } +#endif + if (PageIsEmpty(page)) + { + /* If it's leftmost page too - it's empty root page... */ + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, buf, BT_READ); + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + return ((RetrieveIndexResult) NULL); + } + /* Go back ! */ + ItemPointerSet(current, blkno, FirstOffsetNumber); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + return ((RetrieveIndexResult) NULL); + + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } + /* new stuff ends here */ + else + { + start = PageGetMaxOffsetNumber(page); + ItemPointerSet(current, blkno, start); + } } - if ( start > maxoff ) /* start == 2 && maxoff == 1 */ + else { - ItemPointerSet(current, blkno, maxoff); - if (!_bt_step(scan, &buf, ForwardScanDirection)) - return ((RetrieveIndexResult) NULL); - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); + elog(WARN, "Illegal scan direction %d", dir); } - /* new stuff ends here */ - else { - ItemPointerSet(current, blkno, start); + + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); + itup = &(btitem->bti_itup); + + /* see if we picked a winner */ + if (_bt_checkkeys(scan, itup, &keysok)) + { + res = FormRetrieveIndexResult(current, &(itup->t_tid)); + + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; } - } else if (ScanDirectionIsBackward(dir)) { - /* - * I don't understand this stuff too! If RIGHT-most leaf page is - * empty why do scanning in ForwardScanDirection ??? - * Well - new stuff. - vadim 12/06/96 - */ -#if 0 - if (PageIsEmpty(page)) { - ItemPointerSet(current, blkno, FirstOffsetNumber); - if (!_bt_step(scan, &buf, ForwardScanDirection)) - return ((RetrieveIndexResult) NULL); - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); + else if (keysok >= so->numberOfFirstKeys) + { + so->btso_curbuf = buf; + return (_bt_next(scan, dir)); } -#endif - if (PageIsEmpty(page)) + else { - /* If it's leftmost page too - it's empty root page... */ - if ( P_LEFTMOST(opaque) ) - { - _bt_relbuf(rel, buf, BT_READ); ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - return ((RetrieveIndexResult) NULL); - } - /* Go back ! */ - ItemPointerSet(current, blkno, FirstOffsetNumber); - if (!_bt_step(scan, &buf, BackwardScanDirection)) - return ((RetrieveIndexResult) NULL); - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } - /* new stuff ends here */ - else { - start = PageGetMaxOffsetNumber(page); - ItemPointerSet(current, blkno, start); + _bt_relbuf(rel, buf, BT_READ); + res = (RetrieveIndexResult) NULL; } - } else { - elog(WARN, "Illegal scan direction %d", dir); - } - - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); - itup = &(btitem->bti_itup); - - /* see if we picked a winner */ - if ( _bt_checkkeys (scan, itup, &keysok) ) - { - res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned */ - so->btso_curbuf = buf; - } - else if ( keysok >= so->numberOfFirstKeys ) - { - so->btso_curbuf = buf; - return (_bt_next (scan, dir)); - } - else - { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - res = (RetrieveIndexResult) NULL; - } - - return (res); + + return (res); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 8e054d24abf..09cb43769f2 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -5,30 +5,30 @@ * * * IDENTIFICATION - * $Id: nbtsort.c,v 1.19 1997/08/19 21:29:46 momjian Exp $ + * $Id: nbtsort.c,v 1.20 1997/09/07 04:39:02 momjian Exp $ * * NOTES * * what we do is: * - generate a set of initial one-block runs, distributed round-robin - * between the output tapes. + * between the output tapes. * - for each pass, - * - swap input and output tape sets, rewinding both and truncating - * the output tapes. - * - merge the current run in each input tape to the current output - * tape. - * - when each input run has been exhausted, switch to another output - * tape and start processing another run. + * - swap input and output tape sets, rewinding both and truncating + * the output tapes. + * - merge the current run in each input tape to the current output + * tape. + * - when each input run has been exhausted, switch to another output + * tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start - * merging into the btree leaf pages. (i.e., we do not have to wait - * until we have exactly one tape.) + * merging into the btree leaf pages. (i.e., we do not have to wait + * until we have exactly one tape.) * - as we extract tuples from the final runs, we build the pages for - * each level. when we have only one page on a level, it must be the - * root -- it can be attached to the btree metapage and we are done. + * each level. when we have only one page on a level, it must be the + * root -- it can be attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their - * opaque handles. this is for modularity reasons. + * opaque handles. this is for modularity reasons. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on @@ -58,20 +58,21 @@ #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif #ifdef BTREE_BUILD_STATS #include <tcop/tcopprot.h> -extern int ShowExecutorStats; +extern int ShowExecutorStats; + #endif -static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags); -static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); -static void *_bt_pagestate(Relation index, int flags, int level, bool doupper); -static void _bt_uppershutdown(Relation index, BTPageState *state); +static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags); +static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); +static void *_bt_pagestate(Relation index, int flags, int level, bool doupper); +static void _bt_uppershutdown(Relation index, BTPageState * state); /* * turn on debugging output. @@ -83,18 +84,18 @@ static void _bt_uppershutdown(Relation index, BTPageState *state); #define FASTBUILD_SPOOL #define FASTBUILD_MERGE -#define MAXTAPES (7) -#define TAPEBLCKSZ (MAXBLCKSZ << 2) -#define TAPETEMP "pg_btsortXXXXXX" +#define MAXTAPES (7) +#define TAPEBLCKSZ (MAXBLCKSZ << 2) +#define TAPETEMP "pg_btsortXXXXXX" -extern int NDirectFileRead; -extern int NDirectFileWrite; -extern char *mktemp(char *template); +extern int NDirectFileRead; +extern int NDirectFileWrite; +extern char *mktemp(char *template); /* - * this is what we use to shovel BTItems in and out of memory. it's + * this is what we use to shovel BTItems in and out of memory. it's * bigger than a standard block because we are doing a lot of strictly - * sequential i/o. this is obviously something of a tradeoff since we + * sequential i/o. this is obviously something of a tradeoff since we * are potentially reading a bunch of zeroes off of disk in many * cases. * @@ -104,14 +105,15 @@ extern char *mktemp(char *template); * the only thing like that so i'm not going to worry about wasting a * few bytes. */ -typedef struct { - int bttb_magic; /* magic number */ - int bttb_fd; /* file descriptor */ - int bttb_top; /* top of free space within bttb_data */ - short bttb_ntup; /* number of tuples in this block */ - short bttb_eor; /* End-Of-Run marker */ - char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; -} BTTapeBlock; +typedef struct +{ + int bttb_magic; /* magic number */ + int bttb_fd; /* file descriptor */ + int bttb_top; /* top of free space within bttb_data */ + short bttb_ntup; /* number of tuples in this block */ + short bttb_eor; /* End-Of-Run marker */ + char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; +} BTTapeBlock; /* * this structure holds the bookkeeping for a simple balanced multiway @@ -120,13 +122,14 @@ typedef struct { * right now. though if psort was in a condition that i could hack it * to do this, you bet i would.) */ -typedef struct { - int bts_ntapes; - int bts_tape; - BTTapeBlock **bts_itape; /* input tape blocks */ - BTTapeBlock **bts_otape; /* output tape blocks */ - bool isunique; -} BTSpool; +typedef struct +{ + int bts_ntapes; + int bts_tape; + BTTapeBlock **bts_itape; /* input tape blocks */ + BTTapeBlock **bts_otape; /* output tape blocks */ + bool isunique; +} BTSpool; /*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether @@ -146,101 +149,102 @@ typedef struct { * what the heck. * *------------------------------------------------------------------------- */ -typedef struct { - Datum *btsk_datum; - char *btsk_nulls; - BTItem btsk_item; -} BTSortKey; +typedef struct +{ + Datum *btsk_datum; + char *btsk_nulls; + BTItem btsk_item; +} BTSortKey; static Relation _bt_sortrel; -static int _bt_nattr; -static BTSpool * _bt_inspool; +static int _bt_nattr; +static BTSpool *_bt_inspool; static void -_bt_isortcmpinit(Relation index, BTSpool *spool) +_bt_isortcmpinit(Relation index, BTSpool * spool) { - _bt_sortrel = index; - _bt_inspool = spool; - _bt_nattr = index->rd_att->natts; + _bt_sortrel = index; + _bt_inspool = spool; + _bt_nattr = index->rd_att->natts; } static int -_bt_isortcmp(BTSortKey *k1, BTSortKey *k2) +_bt_isortcmp(BTSortKey * k1, BTSortKey * k2) { - Datum *k1_datum = k1->btsk_datum; - Datum *k2_datum = k2->btsk_datum; - char *k1_nulls = k1->btsk_nulls; - char *k2_nulls = k2->btsk_nulls; - bool equal_isnull = false; - int i; - - if (k1->btsk_item == (BTItem) NULL) - { - if (k2->btsk_item == (BTItem) NULL) - return(0); /* 1 = 2 */ - return(1); /* 1 > 2 */ - } - else if (k2->btsk_item == (BTItem) NULL) - return(-1); /* 1 < 2 */ - - for (i = 0; i < _bt_nattr; i++) - { - if ( k1_nulls[i] != ' ' ) /* k1 attr is NULL */ + Datum *k1_datum = k1->btsk_datum; + Datum *k2_datum = k2->btsk_datum; + char *k1_nulls = k1->btsk_nulls; + char *k2_nulls = k2->btsk_nulls; + bool equal_isnull = false; + int i; + + if (k1->btsk_item == (BTItem) NULL) { - if ( k2_nulls[i] != ' ' ) /* the same for k2 */ - { - equal_isnull = true; - continue; - } - return (1); /* NULL ">" NOT_NULL */ + if (k2->btsk_item == (BTItem) NULL) + return (0); /* 1 = 2 */ + return (1); /* 1 > 2 */ } - else if ( k2_nulls[i] != ' ' ) /* k2 attr is NULL */ - return (-1); /* NOT_NULL "<" NULL */ - - if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber, - k1_datum[i], k2_datum[i])) - return(1); /* 1 > 2 */ - else if (_bt_invokestrat(_bt_sortrel, i+1, BTGreaterStrategyNumber, - k2_datum[i], k1_datum[i])) - return(-1); /* 1 < 2 */ - } - - if ( _bt_inspool->isunique && !equal_isnull ) - { - _bt_spooldestroy ((void*)_bt_inspool); - elog (WARN, "Cannot create unique index. Table contains non-unique values"); - } - return(0); /* 1 = 2 */ + else if (k2->btsk_item == (BTItem) NULL) + return (-1); /* 1 < 2 */ + + for (i = 0; i < _bt_nattr; i++) + { + if (k1_nulls[i] != ' ') /* k1 attr is NULL */ + { + if (k2_nulls[i] != ' ') /* the same for k2 */ + { + equal_isnull = true; + continue; + } + return (1); /* NULL ">" NOT_NULL */ + } + else if (k2_nulls[i] != ' ') /* k2 attr is NULL */ + return (-1); /* NOT_NULL "<" NULL */ + + if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, + k1_datum[i], k2_datum[i])) + return (1); /* 1 > 2 */ + else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, + k2_datum[i], k1_datum[i])) + return (-1); /* 1 < 2 */ + } + + if (_bt_inspool->isunique && !equal_isnull) + { + _bt_spooldestroy((void *) _bt_inspool); + elog(WARN, "Cannot create unique index. Table contains non-unique values"); + } + return (0); /* 1 = 2 */ } static void -_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) +_bt_setsortkey(Relation index, BTItem bti, BTSortKey * sk) { - sk->btsk_item = (BTItem) NULL; - sk->btsk_datum = (Datum*) NULL; - sk->btsk_nulls = (char*) NULL; - - if (bti != (BTItem) NULL) - { - IndexTuple it = &(bti->bti_itup); - TupleDesc itdesc = index->rd_att; - Datum *dp = (Datum*) palloc (_bt_nattr * sizeof (Datum)); - char *np = (char*) palloc (_bt_nattr * sizeof (char)); - bool isnull; - int i; - - for (i = 0; i < _bt_nattr; i++) - { - dp[i] = index_getattr(it, i+1, itdesc, &isnull); - if ( isnull ) - np[i] = 'n'; - else - np[i] = ' '; + sk->btsk_item = (BTItem) NULL; + sk->btsk_datum = (Datum *) NULL; + sk->btsk_nulls = (char *) NULL; + + if (bti != (BTItem) NULL) + { + IndexTuple it = &(bti->bti_itup); + TupleDesc itdesc = index->rd_att; + Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum)); + char *np = (char *) palloc(_bt_nattr * sizeof(char)); + bool isnull; + int i; + + for (i = 0; i < _bt_nattr; i++) + { + dp[i] = index_getattr(it, i + 1, itdesc, &isnull); + if (isnull) + np[i] = 'n'; + else + np[i] = ' '; + } + sk->btsk_item = bti; + sk->btsk_datum = dp; + sk->btsk_nulls = np; } - sk->btsk_item = bti; - sk->btsk_datum = dp; - sk->btsk_nulls = np; - } } /*------------------------------------------------------------------------- @@ -254,84 +258,100 @@ _bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) * XXX these probably ought to be generic library functions. *------------------------------------------------------------------------- */ -typedef struct { - int btpqe_tape; /* tape identifier */ - BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */ -} BTPriQueueElem; - -#define MAXELEM MAXTAPES -typedef struct { - int btpq_nelem; - BTPriQueueElem btpq_queue[MAXELEM]; - Relation btpq_rel; -} BTPriQueue; +typedef struct +{ + int btpqe_tape; /* tape identifier */ + BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */ +} BTPriQueueElem; + +#define MAXELEM MAXTAPES +typedef struct +{ + int btpq_nelem; + BTPriQueueElem btpq_queue[MAXELEM]; + Relation btpq_rel; +} BTPriQueue; /* be sure to call _bt_isortcmpinit first */ #define GREATER(a, b) \ - (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) + (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) static void -_bt_pqsift(BTPriQueue *q, int parent) +_bt_pqsift(BTPriQueue * q, int parent) { - int child; - BTPriQueueElem e; - - for (child = parent * 2 + 1; - child < q->btpq_nelem; - child = parent * 2 + 1) { - if (child < q->btpq_nelem - 1) { - if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child+1]))) { - ++child; - } - } - if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) { - e = q->btpq_queue[child]; /* struct = */ - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - q->btpq_queue[parent] = e; /* struct = */ - parent = child; - } else { - parent = child + 1; + int child; + BTPriQueueElem e; + + for (child = parent * 2 + 1; + child < q->btpq_nelem; + child = parent * 2 + 1) + { + if (child < q->btpq_nelem - 1) + { + if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1]))) + { + ++child; + } + } + if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) + { + e = q->btpq_queue[child]; /* struct = */ + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + q->btpq_queue[parent] = e; /* struct = */ + parent = child; + } + else + { + parent = child + 1; + } } - } } static int -_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) +_bt_pqnext(BTPriQueue * q, BTPriQueueElem * e) { - if (q->btpq_nelem < 1) { /* already empty */ - return(-1); - } - *e = q->btpq_queue[0]; /* struct = */ - - if (--q->btpq_nelem < 1) { /* now empty, don't sift */ - return(0); - } - q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ - _bt_pqsift(q, 0); - return(0); + if (q->btpq_nelem < 1) + { /* already empty */ + return (-1); + } + *e = q->btpq_queue[0]; /* struct = */ + + if (--q->btpq_nelem < 1) + { /* now empty, don't sift */ + return (0); + } + q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ + _bt_pqsift(q, 0); + return (0); } static void -_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) +_bt_pqadd(BTPriQueue * q, BTPriQueueElem * e) { - int child, parent; - - if (q->btpq_nelem >= MAXELEM) { - elog(WARN, "_bt_pqadd: queue overflow"); - } - - child = q->btpq_nelem++; - while (child > 0) { - parent = child / 2; - if (GREATER(e, &(q->btpq_queue[parent]))) { - break; - } else { - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - child = parent; + int child, + parent; + + if (q->btpq_nelem >= MAXELEM) + { + elog(WARN, "_bt_pqadd: queue overflow"); + } + + child = q->btpq_nelem++; + while (child > 0) + { + parent = child / 2; + if (GREATER(e, &(q->btpq_queue[parent]))) + { + break; + } + else + { + q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ + child = parent; + } } - } - q->btpq_queue[child] = *e; /* struct = */ + q->btpq_queue[child] = *e; /* struct = */ } /*------------------------------------------------------------------------- @@ -339,37 +359,37 @@ _bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) *------------------------------------------------------------------------- */ -#define BTITEMSZ(btitem) \ - ((btitem) ? \ - (IndexTupleDSize((btitem)->bti_itup) + \ - (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ - 0) -#define SPCLEFT(tape) \ - (sizeof((tape)->bttb_data) - (tape)->bttb_top) -#define EMPTYTAPE(tape) \ - ((tape)->bttb_ntup <= 0) -#define BTTAPEMAGIC 0x19660226 +#define BTITEMSZ(btitem) \ + ((btitem) ? \ + (IndexTupleDSize((btitem)->bti_itup) + \ + (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ + 0) +#define SPCLEFT(tape) \ + (sizeof((tape)->bttb_data) - (tape)->bttb_top) +#define EMPTYTAPE(tape) \ + ((tape)->bttb_ntup <= 0) +#define BTTAPEMAGIC 0x19660226 /* * reset the tape header for its next use without doing anything to - * the physical tape file. (setting bttb_top to 0 makes the block + * the physical tape file. (setting bttb_top to 0 makes the block * empty.) */ static void -_bt_tapereset(BTTapeBlock *tape) +_bt_tapereset(BTTapeBlock * tape) { - tape->bttb_eor = 0; - tape->bttb_top = 0; - tape->bttb_ntup = 0; + tape->bttb_eor = 0; + tape->bttb_top = 0; + tape->bttb_ntup = 0; } /* * rewind the physical tape file. */ static void -_bt_taperewind(BTTapeBlock *tape) +_bt_taperewind(BTTapeBlock * tape) { - FileSeek(tape->bttb_fd, 0, SEEK_SET); + FileSeek(tape->bttb_fd, 0, SEEK_SET); } /* @@ -382,17 +402,17 @@ _bt_taperewind(BTTapeBlock *tape) * least you don't have to delete and reinsert the directory entries. */ static void -_bt_tapeclear(BTTapeBlock *tape) +_bt_tapeclear(BTTapeBlock * tape) { - /* blow away the contents of the old file */ - _bt_taperewind(tape); + /* blow away the contents of the old file */ + _bt_taperewind(tape); #if 0 - FileSync(tape->bttb_fd); + FileSync(tape->bttb_fd); #endif - FileTruncate(tape->bttb_fd, 0); + FileTruncate(tape->bttb_fd, 0); - /* reset the buffer */ - _bt_tapereset(tape); + /* reset the buffer */ + _bt_tapereset(tape); } /* @@ -402,43 +422,44 @@ _bt_tapeclear(BTTapeBlock *tape) static BTTapeBlock * _bt_tapecreate(char *fname) { - BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); + BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); - if (tape == (BTTapeBlock *) NULL) { - elog(WARN, "_bt_tapecreate: out of memory"); - } + if (tape == (BTTapeBlock *) NULL) + { + elog(WARN, "_bt_tapecreate: out of memory"); + } - tape->bttb_magic = BTTAPEMAGIC; + tape->bttb_magic = BTTAPEMAGIC; - tape->bttb_fd = FileNameOpenFile(fname, O_RDWR|O_CREAT|O_TRUNC, 0600); - Assert(tape->bttb_fd >= 0); + tape->bttb_fd = FileNameOpenFile(fname, O_RDWR | O_CREAT | O_TRUNC, 0600); + Assert(tape->bttb_fd >= 0); - /* initialize the buffer */ - _bt_tapereset(tape); + /* initialize the buffer */ + _bt_tapereset(tape); - return(tape); + return (tape); } /* * destroy the BTTapeBlock structure and its physical tape file. */ static void -_bt_tapedestroy(BTTapeBlock *tape) +_bt_tapedestroy(BTTapeBlock * tape) { - FileUnlink(tape->bttb_fd); - pfree((void *) tape); + FileUnlink(tape->bttb_fd); + pfree((void *) tape); } /* * flush the tape block to the file, marking End-Of-Run if requested. */ static void -_bt_tapewrite(BTTapeBlock *tape, int eor) +_bt_tapewrite(BTTapeBlock * tape, int eor) { - tape->bttb_eor = eor; - FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); - NDirectFileWrite += TAPEBLCKSZ/MAXBLCKSZ; - _bt_tapereset(tape); + tape->bttb_eor = eor; + FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); + NDirectFileWrite += TAPEBLCKSZ / MAXBLCKSZ; + _bt_tapereset(tape); } /* @@ -447,34 +468,36 @@ _bt_tapewrite(BTTapeBlock *tape, int eor) * * returns: * - 0 if there are no more blocks in the tape or in this run (call - * _bt_tapereset to clear the End-Of-Run marker) + * _bt_tapereset to clear the End-Of-Run marker) * - 1 if a valid block was read */ static int -_bt_taperead(BTTapeBlock *tape) +_bt_taperead(BTTapeBlock * tape) { - int fd; - int nread; - - if (tape->bttb_eor) { - return(0); /* we are already at End-Of-Run */ - } - - /* - * we're clobbering the old tape block, but we do need to save the - * VFD (the one in the block we're reading is bogus). - */ - fd = tape->bttb_fd; - nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); - tape->bttb_fd = fd; - - if (nread != TAPEBLCKSZ) { - Assert(nread == 0); /* we are at EOF */ - return(0); - } - Assert(tape->bttb_magic == BTTAPEMAGIC); - NDirectFileRead += TAPEBLCKSZ/MAXBLCKSZ; - return(1); + int fd; + int nread; + + if (tape->bttb_eor) + { + return (0); /* we are already at End-Of-Run */ + } + + /* + * we're clobbering the old tape block, but we do need to save the VFD + * (the one in the block we're reading is bogus). + */ + fd = tape->bttb_fd; + nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); + tape->bttb_fd = fd; + + if (nread != TAPEBLCKSZ) + { + Assert(nread == 0); /* we are at EOF */ + return (0); + } + Assert(tape->bttb_magic == BTTAPEMAGIC); + NDirectFileRead += TAPEBLCKSZ / MAXBLCKSZ; + return (1); } /* @@ -487,19 +510,20 @@ _bt_taperead(BTTapeBlock *tape) * side effects: * - sets 'pos' to the current position within the block. */ -static BTItem -_bt_tapenext(BTTapeBlock *tape, char **pos) +static BTItem +_bt_tapenext(BTTapeBlock * tape, char **pos) { - Size itemsz; - BTItem bti; - - if (*pos >= tape->bttb_data + tape->bttb_top) { - return((BTItem) NULL); - } - bti = (BTItem) *pos; - itemsz = BTITEMSZ(bti); - *pos += DOUBLEALIGN(itemsz); - return(bti); + Size itemsz; + BTItem bti; + + if (*pos >= tape->bttb_data + tape->bttb_top) + { + return ((BTItem) NULL); + } + bti = (BTItem) * pos; + itemsz = BTITEMSZ(bti); + *pos += DOUBLEALIGN(itemsz); + return (bti); } /* @@ -514,11 +538,11 @@ _bt_tapenext(BTTapeBlock *tape, char **pos) * the beginning of free space. */ static void -_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) +_bt_tapeadd(BTTapeBlock * tape, BTItem item, int itemsz) { - memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); - ++tape->bttb_ntup; - tape->bttb_top += DOUBLEALIGN(itemsz); + memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); + ++tape->bttb_ntup; + tape->bttb_top += DOUBLEALIGN(itemsz); } /*------------------------------------------------------------------------- @@ -530,41 +554,44 @@ _bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) * create and initialize a spool structure, including the underlying * files. */ -void * +void * _bt_spoolinit(Relation index, int ntapes, bool isunique) { - BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); - int i; - char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); - - if (btspool == (BTSpool *) NULL || fname == (char *) NULL) { - elog(WARN, "_bt_spoolinit: out of memory"); - } - memset((char *) btspool, 0, sizeof(BTSpool)); - btspool->bts_ntapes = ntapes; - btspool->bts_tape = 0; - btspool->isunique = isunique; - - btspool->bts_itape = - (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - btspool->bts_otape = - (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - if (btspool->bts_itape == (BTTapeBlock **) NULL || - btspool->bts_otape == (BTTapeBlock **) NULL) { - elog(WARN, "_bt_spoolinit: out of memory"); - } - - for (i = 0; i < ntapes; ++i) { - btspool->bts_itape[i] = - _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); - btspool->bts_otape[i] = - _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); - } - pfree((void *) fname); - - _bt_isortcmpinit(index, btspool); - - return((void *) btspool); + BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); + int i; + char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); + + if (btspool == (BTSpool *) NULL || fname == (char *) NULL) + { + elog(WARN, "_bt_spoolinit: out of memory"); + } + memset((char *) btspool, 0, sizeof(BTSpool)); + btspool->bts_ntapes = ntapes; + btspool->bts_tape = 0; + btspool->isunique = isunique; + + btspool->bts_itape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + btspool->bts_otape = + (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); + if (btspool->bts_itape == (BTTapeBlock **) NULL || + btspool->bts_otape == (BTTapeBlock **) NULL) + { + elog(WARN, "_bt_spoolinit: out of memory"); + } + + for (i = 0; i < ntapes; ++i) + { + btspool->bts_itape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + btspool->bts_otape[i] = + _bt_tapecreate(mktemp(strcpy(fname, TAPETEMP))); + } + pfree((void *) fname); + + _bt_isortcmpinit(index, btspool); + + return ((void *) btspool); } /* @@ -573,29 +600,32 @@ _bt_spoolinit(Relation index, int ntapes, bool isunique) void _bt_spooldestroy(void *spool) { - BTSpool *btspool = (BTSpool *) spool; - int i; - - for (i = 0; i < btspool->bts_ntapes; ++i) { - _bt_tapedestroy(btspool->bts_otape[i]); - _bt_tapedestroy(btspool->bts_itape[i]); - } - pfree((void *) btspool); + BTSpool *btspool = (BTSpool *) spool; + int i; + + for (i = 0; i < btspool->bts_ntapes; ++i) + { + _bt_tapedestroy(btspool->bts_otape[i]); + _bt_tapedestroy(btspool->bts_itape[i]); + } + pfree((void *) btspool); } /* * flush out any dirty output tape blocks */ static void -_bt_spoolflush(BTSpool *btspool) +_bt_spoolflush(BTSpool * btspool) { - int i; + int i; - for (i = 0; i < btspool->bts_ntapes; ++i) { - if (!EMPTYTAPE(btspool->bts_otape[i])) { - _bt_tapewrite(btspool->bts_otape[i], 1); + for (i = 0; i < btspool->bts_ntapes; ++i) + { + if (!EMPTYTAPE(btspool->bts_otape[i])) + { + _bt_tapewrite(btspool->bts_otape[i], 1); + } } - } } /* @@ -605,36 +635,37 @@ _bt_spoolflush(BTSpool *btspool) * output tapes. */ static void -_bt_spoolswap(BTSpool *btspool) +_bt_spoolswap(BTSpool * btspool) { - File tmpfd; - BTTapeBlock *itape; - BTTapeBlock *otape; - int i; + File tmpfd; + BTTapeBlock *itape; + BTTapeBlock *otape; + int i; - for (i = 0; i < btspool->bts_ntapes; ++i) { - itape = btspool->bts_itape[i]; - otape = btspool->bts_otape[i]; + for (i = 0; i < btspool->bts_ntapes; ++i) + { + itape = btspool->bts_itape[i]; + otape = btspool->bts_otape[i]; - /* - * swap the input and output VFDs. - */ - tmpfd = itape->bttb_fd; - itape->bttb_fd = otape->bttb_fd; - otape->bttb_fd = tmpfd; + /* + * swap the input and output VFDs. + */ + tmpfd = itape->bttb_fd; + itape->bttb_fd = otape->bttb_fd; + otape->bttb_fd = tmpfd; - /* - * rewind the new input tape. - */ - _bt_taperewind(itape); - _bt_tapereset(itape); + /* + * rewind the new input tape. + */ + _bt_taperewind(itape); + _bt_tapereset(itape); - /* - * clear the new output tape -- it's ok to throw away the old - * inputs. - */ - _bt_tapeclear(otape); - } + /* + * clear the new output tape -- it's ok to throw away the old + * inputs. + */ + _bt_tapeclear(otape); + } } /*------------------------------------------------------------------------- @@ -643,7 +674,7 @@ _bt_spoolswap(BTSpool *btspool) */ /* - * spool 'btitem' into an initial run. as tape blocks are filled, the + * spool 'btitem' into an initial run. as tape blocks are filled, the * block BTItems are qsorted and written into some output tape (it * doesn't matter which; we go round-robin for simplicity). the * initial runs are therefore always just one block. @@ -651,134 +682,137 @@ _bt_spoolswap(BTSpool *btspool) void _bt_spool(Relation index, BTItem btitem, void *spool) { - BTSpool *btspool = (BTSpool *) spool; - BTTapeBlock *itape; - Size itemsz; - - _bt_isortcmpinit (index, btspool); - - itape = btspool->bts_itape[btspool->bts_tape]; - itemsz = BTITEMSZ(btitem); - itemsz = DOUBLEALIGN(itemsz); - - /* - * if this buffer is too full for this BTItemData, or if we have - * run out of BTItems, we need to sort the buffer and write it - * out. in this case, the BTItemData will go into the next tape's - * buffer. - */ - if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { - BTSortKey *parray = (BTSortKey *) NULL; - BTTapeBlock *otape; - BTItem bti; - char *pos; - int btisz; - int it_ntup = itape->bttb_ntup; - int i; + BTSpool *btspool = (BTSpool *) spool; + BTTapeBlock *itape; + Size itemsz; - /* - * build an array of pointers to the BTItemDatas on the input - * block. - */ - if (it_ntup > 0) { - parray = - (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey)); - pos = itape->bttb_data; - for (i = 0; i < it_ntup; ++i) { - _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); - } - - /* - * qsort the pointer array. - */ - qsort((void *) parray, it_ntup, sizeof(BTSortKey), - (int (*)(const void *,const void *))_bt_isortcmp); - } + _bt_isortcmpinit(index, btspool); + + itape = btspool->bts_itape[btspool->bts_tape]; + itemsz = BTITEMSZ(btitem); + itemsz = DOUBLEALIGN(itemsz); /* - * write the spooled run into the output tape. we copy the - * BTItemDatas in the order dictated by the sorted array of - * BTItems, not the original order. - * - * (since everything was DOUBLEALIGN'd and is all on a single - * tape block, everything had *better* still fit on one tape - * block..) + * if this buffer is too full for this BTItemData, or if we have run + * out of BTItems, we need to sort the buffer and write it out. in + * this case, the BTItemData will go into the next tape's buffer. */ - otape = btspool->bts_otape[btspool->bts_tape]; - for (i = 0; i < it_ntup; ++i) { - bti = parray[i].btsk_item; - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - _bt_tapeadd(otape, bti, btisz); + if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) + { + BTSortKey *parray = (BTSortKey *) NULL; + BTTapeBlock *otape; + BTItem bti; + char *pos; + int btisz; + int it_ntup = itape->bttb_ntup; + int i; + + /* + * build an array of pointers to the BTItemDatas on the input + * block. + */ + if (it_ntup > 0) + { + parray = + (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey)); + pos = itape->bttb_data; + for (i = 0; i < it_ntup; ++i) + { + _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); + } + + /* + * qsort the pointer array. + */ + qsort((void *) parray, it_ntup, sizeof(BTSortKey), + (int (*) (const void *, const void *)) _bt_isortcmp); + } + + /* + * write the spooled run into the output tape. we copy the + * BTItemDatas in the order dictated by the sorted array of + * BTItems, not the original order. + * + * (since everything was DOUBLEALIGN'd and is all on a single tape + * block, everything had *better* still fit on one tape block..) + */ + otape = btspool->bts_otape[btspool->bts_tape]; + for (i = 0; i < it_ntup; ++i) + { + bti = parray[i].btsk_item; + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + _bt_tapeadd(otape, bti, btisz); #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, - &isnull); - printf("_bt_spool: inserted <%x> into output tape %d\n", - d, btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ - } + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, + &isnull); - /* - * the initial runs are always single tape blocks. flush the - * output block, marking End-Of-Run. - */ - _bt_tapewrite(otape, 1); + printf("_bt_spool: inserted <%x> into output tape %d\n", + d, btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ + } - /* - * reset the input buffer for the next run. we don't have to - * write it out or anything -- we only use it to hold the - * unsorted BTItemDatas, the output tape contains all the - * sorted stuff. - * - * changing bts_tape changes the output tape and input tape; - * we change itape for the code below. - */ - _bt_tapereset(itape); - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - itape = btspool->bts_itape[btspool->bts_tape]; + /* + * the initial runs are always single tape blocks. flush the + * output block, marking End-Of-Run. + */ + _bt_tapewrite(otape, 1); - /* - * destroy the pointer array. - */ - if (parray != (BTSortKey *) NULL) - { - for (i = 0; i < it_ntup; i++) - { - if ( parray[i].btsk_datum != (Datum*) NULL ) - pfree ((void*)(parray[i].btsk_datum)); - if ( parray[i].btsk_nulls != (char*) NULL ) - pfree ((void*)(parray[i].btsk_nulls)); - } - pfree((void *) parray); + /* + * reset the input buffer for the next run. we don't have to + * write it out or anything -- we only use it to hold the unsorted + * BTItemDatas, the output tape contains all the sorted stuff. + * + * changing bts_tape changes the output tape and input tape; we + * change itape for the code below. + */ + _bt_tapereset(itape); + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + itape = btspool->bts_itape[btspool->bts_tape]; + + /* + * destroy the pointer array. + */ + if (parray != (BTSortKey *) NULL) + { + for (i = 0; i < it_ntup; i++) + { + if (parray[i].btsk_datum != (Datum *) NULL) + pfree((void *) (parray[i].btsk_datum)); + if (parray[i].btsk_nulls != (char *) NULL) + pfree((void *) (parray[i].btsk_nulls)); + } + pfree((void *) parray); + } } - } - /* insert this item into the current buffer */ - if (btitem != (BTItem) NULL) { - _bt_tapeadd(itape, btitem, itemsz); - } + /* insert this item into the current buffer */ + if (btitem != (BTItem) NULL) + { + _bt_tapeadd(itape, btitem, itemsz); + } } /* * allocate a new, clean btree page, not linked to any siblings. */ static void -_bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) +_bt_blnewpage(Relation index, Buffer * buf, Page * page, int flags) { - BTPageOpaque opaque; + BTPageOpaque opaque; - *buf = _bt_getbuf(index, P_NEW, BT_WRITE); + *buf = _bt_getbuf(index, P_NEW, BT_WRITE); #if 0 - printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); + printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); #endif - *page = BufferGetPage(*buf); - _bt_pageinit(*page, BufferGetPageSize(*buf)); - opaque = (BTPageOpaque) PageGetSpecialPointer(*page); - opaque->btpo_prev = opaque->btpo_next = P_NONE; - opaque->btpo_flags = flags; + *page = BufferGetPage(*buf); + _bt_pageinit(*page, BufferGetPageSize(*buf)); + opaque = (BTPageOpaque) PageGetSpecialPointer(*page); + opaque->btpo_prev = opaque->btpo_next = P_NONE; + opaque->btpo_flags = flags; } /* @@ -790,42 +824,44 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) static void _bt_slideleft(Relation index, Buffer buf, Page page) { - OffsetNumber off; - OffsetNumber maxoff; - ItemId previi; - ItemId thisii; - - if (!PageIsEmpty(page)) { - maxoff = PageGetMaxOffsetNumber(page); - previi = PageGetItemId(page, P_HIKEY); - for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { - thisii = PageGetItemId(page, off); - *previi = *thisii; - previi = thisii; + OffsetNumber off; + OffsetNumber maxoff; + ItemId previi; + ItemId thisii; + + if (!PageIsEmpty(page)) + { + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) + { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); } - ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); - } } /* * allocate and initialize a new BTPageState. the returned structure * is suitable for immediate use by _bt_buildadd. */ -static void * +static void * _bt_pagestate(Relation index, int flags, int level, bool doupper) { - BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); - - memset((char *) state, 0, sizeof(BTPageState)); - _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); - state->btps_firstoff = InvalidOffsetNumber; - state->btps_lastoff = P_HIKEY; - state->btps_lastbti = (BTItem) NULL; - state->btps_next = (BTPageState *) NULL; - state->btps_level = level; - state->btps_doupper = doupper; - - return((void *) state); + BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); + + memset((char *) state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); + state->btps_firstoff = InvalidOffsetNumber; + state->btps_lastoff = P_HIKEY; + state->btps_lastbti = (BTItem) NULL; + state->btps_next = (BTPageState *) NULL; + state->btps_level = level; + state->btps_doupper = doupper; + + return ((void *) state); } /* @@ -834,19 +870,19 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper) * the page to which the item used to point, e.g., a heap page if * 'opage' is a leaf page). */ -static BTItem +static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend) { - OffsetNumber off; - BTItem obti; - BTItem nbti; + OffsetNumber off; + BTItem obti; + BTItem nbti; - off = atend ? P_HIKEY : P_FIRSTKEY; - obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); - nbti = _bt_formitem(&(obti->bti_itup)); - ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); + off = atend ? P_HIKEY : P_FIRSTKEY; + obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); + nbti = _bt_formitem(&(obti->bti_itup)); + ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); - return(nbti); + return (nbti); } /* @@ -855,26 +891,26 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * we must be careful to observe the following restrictions, placed * upon us by the conventions in nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at - * P_FIRSTKEY. + * P_FIRSTKEY. * - duplicates cannot be split among pages unless the chain of - * duplicates starts at the first data item. + * duplicates starts at the first data item. * * a leaf page being built looks like: * * +----------------+---------------------------------+ - * | PageHeaderData | linp0 linp1 linp2 ... | + * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ - * | ... linpN | ^ first | + * | ... linpN | ^ first | * +-----------+--------------------------------------+ - * | ^ last | - * | | - * | v last | + * | ^ last | + * | | + * | v last | * +-------------+------------------------------------+ - * | | itemN ... | + * | | itemN ... | * +-------------+------------------+-----------------+ - * | ... item3 item2 item1 | "special space" | + * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ - * ^ first + * ^ first * * contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. this is because we reserve linp0 as a @@ -888,216 +924,230 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * * if all keys are unique, 'first' will always be the same as 'last'. */ -static BTItem +static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) { - BTPageState *state = (BTPageState *) pstate; - Buffer nbuf; - Page npage; - BTItem last_bti; - OffsetNumber first_off; - OffsetNumber last_off; - OffsetNumber off; - Size pgspc; - Size btisz; - - nbuf = state->btps_buf; - npage = state->btps_page; - first_off = state->btps_firstoff; - last_off = state->btps_lastoff; - last_bti = state->btps_lastbti; - - pgspc = PageGetFreeSpace(npage); - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - if (pgspc < btisz) { - Buffer obuf = nbuf; - Page opage = npage; - OffsetNumber o, n; - ItemId ii; - ItemId hii; - - _bt_blnewpage(index, &nbuf, &npage, flags); + BTPageState *state = (BTPageState *) pstate; + Buffer nbuf; + Page npage; + BTItem last_bti; + OffsetNumber first_off; + OffsetNumber last_off; + OffsetNumber off; + Size pgspc; + Size btisz; + + nbuf = state->btps_buf; + npage = state->btps_page; + first_off = state->btps_firstoff; + last_off = state->btps_lastoff; + last_bti = state->btps_lastbti; + + pgspc = PageGetFreeSpace(npage); + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (pgspc < btisz) + { + Buffer obuf = nbuf; + Page opage = npage; + OffsetNumber o, + n; + ItemId ii; + ItemId hii; - /* - * if 'last' is part of a chain of duplicates that does not - * start at the beginning of the old page, the entire chain is - * copied to the new page; we delete all of the duplicates - * from the old page except the first, which becomes the high - * key item of the old page. - * - * if the chain starts at the beginning of the page or there - * is no chain ('first' == 'last'), we need only copy 'last' - * to the new page. again, 'first' (== 'last') becomes the - * high key of the old page. - * - * note that in either case, we copy at least one item to the - * new page, so 'last_bti' will always be valid. 'bti' will - * never be the first data item on the new page. - */ - if (first_off == P_FIRSTKEY) { - Assert(last_off != P_FIRSTKEY); - first_off = last_off; - } - for (o = first_off, n = P_FIRSTKEY; - o <= last_off; - o = OffsetNumberNext(o), n = OffsetNumberNext(n)) { - ii = PageGetItemId(opage, o); - if ( PageAddItem(npage, PageGetItem(opage, ii), - ii->lp_len, n, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the page in _bt_sort (1)"); + _bt_blnewpage(index, &nbuf, &npage, flags); + + /* + * if 'last' is part of a chain of duplicates that does not start + * at the beginning of the old page, the entire chain is copied to + * the new page; we delete all of the duplicates from the old page + * except the first, which becomes the high key item of the old + * page. + * + * if the chain starts at the beginning of the page or there is no + * chain ('first' == 'last'), we need only copy 'last' to the new + * page. again, 'first' (== 'last') becomes the high key of the + * old page. + * + * note that in either case, we copy at least one item to the new + * page, so 'last_bti' will always be valid. 'bti' will never be + * the first data item on the new page. + */ + if (first_off == P_FIRSTKEY) + { + Assert(last_off != P_FIRSTKEY); + first_off = last_off; + } + for (o = first_off, n = P_FIRSTKEY; + o <= last_off; + o = OffsetNumberNext(o), n = OffsetNumberNext(n)) + { + ii = PageGetItemId(opage, o); + if (PageAddItem(npage, PageGetItem(opage, ii), + ii->lp_len, n, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); #if 0 #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - BTItem tmpbti = - (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); - Datum d = index_getattr(&(tmpbti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", - d, n, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + { + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", + d, n, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif - } - /* - * this loop is backward because PageIndexTupleDelete shuffles - * the tuples to fill holes in the page -- by starting at the - * end and working back, we won't create holes (and thereby - * avoid shuffling). - */ - for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { - PageIndexTupleDelete(opage, o); - } - hii = PageGetItemId(opage, P_HIKEY); - ii = PageGetItemId(opage, first_off); - *hii = *ii; - ii->lp_flags &= ~LP_USED; - ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); + } - first_off = P_FIRSTKEY; - last_off = PageGetMaxOffsetNumber(npage); - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); + /* + * this loop is backward because PageIndexTupleDelete shuffles the + * tuples to fill holes in the page -- by starting at the end and + * working back, we won't create holes (and thereby avoid + * shuffling). + */ + for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) + { + PageIndexTupleDelete(opage, o); + } + hii = PageGetItemId(opage, P_HIKEY); + ii = PageGetItemId(opage, first_off); + *hii = *ii; + ii->lp_flags &= ~LP_USED; + ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - /* - * set the page (side link) pointers. - */ - { - BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); - BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); - - oopaque->btpo_next = BufferGetBlockNumber(nbuf); - nopaque->btpo_prev = BufferGetBlockNumber(obuf); - nopaque->btpo_next = P_NONE; - - if ( _bt_itemcmp(index, _bt_nattr, - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), - BTEqualStrategyNumber) ) - oopaque->btpo_flags |= BTP_CHAIN; - } + first_off = P_FIRSTKEY; + last_off = PageGetMaxOffsetNumber(npage); + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); - /* - * copy the old buffer's minimum key to its parent. if we - * don't have a parent, we have to create one; this adds a new - * btree level. - */ - if (state->btps_doupper) { - BTItem nbti; - - if (state->btps_next == (BTPageState *) NULL) { - state->btps_next = - _bt_pagestate(index, 0, state->btps_level + 1, true); - } - nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); - _bt_buildadd(index, state->btps_next, nbti, 0); - pfree((void *) nbti); + /* + * set the page (side link) pointers. + */ + { + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); + BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); + + oopaque->btpo_next = BufferGetBlockNumber(nbuf); + nopaque->btpo_prev = BufferGetBlockNumber(obuf); + nopaque->btpo_next = P_NONE; + + if (_bt_itemcmp(index, _bt_nattr, + (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), + (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), + BTEqualStrategyNumber)) + oopaque->btpo_flags |= BTP_CHAIN; + } + + /* + * copy the old buffer's minimum key to its parent. if we don't + * have a parent, we have to create one; this adds a new btree + * level. + */ + if (state->btps_doupper) + { + BTItem nbti; + + if (state->btps_next == (BTPageState *) NULL) + { + state->btps_next = + _bt_pagestate(index, 0, state->btps_level + 1, true); + } + nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); + _bt_buildadd(index, state->btps_next, nbti, 0); + pfree((void *) nbti); + } + + /* + * write out the old stuff. we never want to see it again, so we + * can give up our lock (if we had one; BuildingBtree is set, so + * we aren't locking). + */ + _bt_wrtbuf(index, obuf); } /* - * write out the old stuff. we never want to see it again, so - * we can give up our lock (if we had one; BuildingBtree is - * set, so we aren't locking). + * if this item is different from the last item added, we start a new + * chain of duplicates. */ - _bt_wrtbuf(index, obuf); - } - - /* - * if this item is different from the last item added, we start a - * new chain of duplicates. - */ - off = OffsetNumberNext(last_off); - if ( PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber ) - elog (FATAL, "btree: failed to add item to the page in _bt_sort (2)"); + off = OffsetNumberNext(last_off); + if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)"); #if 0 #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); - printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", - d, off, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); + + printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", + d, off, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif - if (last_bti == (BTItem) NULL) - { - first_off = P_FIRSTKEY; - } - else if ( !_bt_itemcmp(index, _bt_nattr, - bti, last_bti, BTEqualStrategyNumber) ) - { - first_off = off; - } - last_off = off; - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); - - state->btps_buf = nbuf; - state->btps_page = npage; - state->btps_lastbti = last_bti; - state->btps_lastoff = last_off; - state->btps_firstoff = first_off; - - return(last_bti); + if (last_bti == (BTItem) NULL) + { + first_off = P_FIRSTKEY; + } + else if (!_bt_itemcmp(index, _bt_nattr, + bti, last_bti, BTEqualStrategyNumber)) + { + first_off = off; + } + last_off = off; + last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); + + state->btps_buf = nbuf; + state->btps_page = npage; + state->btps_lastbti = last_bti; + state->btps_lastoff = last_off; + state->btps_firstoff = first_off; + + return (last_bti); } static void -_bt_uppershutdown(Relation index, BTPageState *state) +_bt_uppershutdown(Relation index, BTPageState * state) { - BTPageState *s; - BlockNumber blkno; - BTPageOpaque opaque; - BTItem bti; + BTPageState *s; + BlockNumber blkno; + BTPageOpaque opaque; + BTItem bti; - for (s = state; s != (BTPageState *) NULL; s = s->btps_next) { - blkno = BufferGetBlockNumber(s->btps_buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); + for (s = state; s != (BTPageState *) NULL; s = s->btps_next) + { + blkno = BufferGetBlockNumber(s->btps_buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); - /* - * if this is the root, attach it to the metapage. otherwise, - * stick the minimum key of the last page on this level (which - * has not been split, or else it wouldn't be the last page) - * into its parent. this may cause the last page of upper - * levels to split, but that's not a problem -- we haven't - * gotten to them yet. - */ - if (s->btps_doupper) { - if (s->btps_next == (BTPageState *) NULL) { - opaque->btpo_flags |= BTP_ROOT; - _bt_metaproot(index, blkno, s->btps_level + 1); - } else { - bti = _bt_minitem(s->btps_page, blkno, 0); - _bt_buildadd(index, s->btps_next, bti, 0); - pfree((void *) bti); - } - } + /* + * if this is the root, attach it to the metapage. otherwise, + * stick the minimum key of the last page on this level (which has + * not been split, or else it wouldn't be the last page) into its + * parent. this may cause the last page of upper levels to split, + * but that's not a problem -- we haven't gotten to them yet. + */ + if (s->btps_doupper) + { + if (s->btps_next == (BTPageState *) NULL) + { + opaque->btpo_flags |= BTP_ROOT; + _bt_metaproot(index, blkno, s->btps_level + 1); + } + else + { + bti = _bt_minitem(s->btps_page, blkno, 0); + _bt_buildadd(index, s->btps_next, bti, 0); + pfree((void *) bti); + } + } - /* - * this is the rightmost page, so the ItemId array needs to be - * slid back one slot. - */ - _bt_slideleft(index, s->btps_buf, s->btps_page); - _bt_wrtbuf(index, s->btps_buf); - } + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, s->btps_buf, s->btps_page); + _bt_wrtbuf(index, s->btps_buf); + } } /* @@ -1105,203 +1155,230 @@ _bt_uppershutdown(Relation index, BTPageState *state) * merging passes until at most one run is left in each tape. at that * point, merge the final tape runs into a set of btree leaves. * - * XXX three nested loops? gross. cut me up into smaller routines. + * XXX three nested loops? gross. cut me up into smaller routines. */ static void -_bt_merge(Relation index, BTSpool *btspool) +_bt_merge(Relation index, BTSpool * btspool) { - BTPageState *state; - BTPriQueue q; - BTPriQueueElem e; - BTSortKey btsk; - BTItem bti; - BTTapeBlock *itape; - BTTapeBlock *otape; - char *tapepos[MAXTAPES]; - int tapedone[MAXTAPES]; - int t; - int goodtapes; - int npass; - int nruns; - Size btisz; - bool doleaf = false; - - /* - * initialize state needed for the merge into the btree leaf pages. - */ - state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); - - npass = 0; - do { /* pass */ + BTPageState *state; + BTPriQueue q; + BTPriQueueElem e; + BTSortKey btsk; + BTItem bti; + BTTapeBlock *itape; + BTTapeBlock *otape; + char *tapepos[MAXTAPES]; + int tapedone[MAXTAPES]; + int t; + int goodtapes; + int npass; + int nruns; + Size btisz; + bool doleaf = false; + /* - * each pass starts by flushing the previous outputs and - * swapping inputs and outputs. flushing sets End-of-Run for - * any dirty output tapes. swapping clears the new output - * tapes and rewinds the new input tapes. + * initialize state needed for the merge into the btree leaf pages. */ - btspool->bts_tape = btspool->bts_ntapes - 1; - _bt_spoolflush(btspool); - _bt_spoolswap(btspool); - - ++npass; - nruns = 0; - - for (;;) { /* run */ - /* - * each run starts by selecting a new output tape. the - * merged results of a given run are always sent to this - * one tape. - */ - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - otape = btspool->bts_otape[btspool->bts_tape]; - - /* - * initialize the priority queue by loading it with the - * first element of the given run in each tape. since we - * are starting a new run, we reset the tape (clearing the - * End-Of-Run marker) before reading it. this means that - * _bt_taperead will return 0 only if the tape is actually - * at EOF. - */ - memset((char *) &q, 0, sizeof(BTPriQueue)); - goodtapes = 0; - for (t = 0; t < btspool->bts_ntapes; ++t) { - itape = btspool->bts_itape[t]; - tapepos[t] = itape->bttb_data; - tapedone[t] = 0; - _bt_tapereset(itape); - do { - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) { - ++goodtapes; - e.btpqe_tape = t; - _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), - &(e.btpqe_item)); - if (e.btpqe_item.btsk_item != (BTItem) NULL) { - _bt_pqadd(&q, &e); - } - } - } - /* - * if we don't have any tapes with any input (i.e., they - * are all at EOF), there is no work to do in this run -- - * we must be done with this pass. - */ - if (goodtapes == 0) { - break; /* for */ - } - ++nruns; - - /* - * output the smallest element from the queue until there - * are no more. - */ - while (_bt_pqnext(&q, &e) >= 0) { /* item */ + state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); + + npass = 0; + do + { /* pass */ + /* - * replace the element taken from priority queue, - * fetching a new block if needed. a tape can run out - * if it hits either End-Of-Run or EOF. + * each pass starts by flushing the previous outputs and swapping + * inputs and outputs. flushing sets End-of-Run for any dirty + * output tapes. swapping clears the new output tapes and rewinds + * the new input tapes. */ - t = e.btpqe_tape; - btsk = e.btpqe_item; - bti = btsk.btsk_item; - if (bti != (BTItem) NULL) { - btisz = BTITEMSZ(bti); - btisz = DOUBLEALIGN(btisz); - if (doleaf) { - _bt_buildadd(index, state, bti, BTP_LEAF); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + btspool->bts_tape = btspool->bts_ntapes - 1; + _bt_spoolflush(btspool); + _bt_spoolswap(btspool); + + ++npass; + nruns = 0; + + for (;;) + { /* run */ + + /* + * each run starts by selecting a new output tape. the merged + * results of a given run are always sent to this one tape. + */ + btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; + otape = btspool->bts_otape[btspool->bts_tape]; + + /* + * initialize the priority queue by loading it with the first + * element of the given run in each tape. since we are + * starting a new run, we reset the tape (clearing the + * End-Of-Run marker) before reading it. this means that + * _bt_taperead will return 0 only if the tape is actually at + * EOF. + */ + memset((char *) &q, 0, sizeof(BTPriQueue)); + goodtapes = 0; + for (t = 0; t < btspool->bts_ntapes; ++t) { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", - npass, nruns, d, t, - BufferGetBlockNumber(state->btps_buf)); + itape = btspool->bts_itape[t]; + tapepos[t] = itape->bttb_data; + tapedone[t] = 0; + _bt_tapereset(itape); + do + { + if (_bt_taperead(itape) == 0) + { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) + { + ++goodtapes; + e.btpqe_tape = t; + _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), + &(e.btpqe_item)); + if (e.btpqe_item.btsk_item != (BTItem) NULL) + { + _bt_pqadd(&q, &e); + } + } } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } else { - if (SPCLEFT(otape) < btisz) { - /* - * if it's full, write it out and add the - * item to the next block. (since we will - * be adding another tuple immediately - * after this, we can be sure that there - * will be at least one more block in this - * run and so we know we do *not* want to - * set End-Of-Run here.) - */ - _bt_tapewrite(otape, 0); - } - _bt_tapeadd(otape, bti, btisz); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + + /* + * if we don't have any tapes with any input (i.e., they are + * all at EOF), there is no work to do in this run -- we must + * be done with this pass. + */ + if (goodtapes == 0) { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", - npass, nruns, d, t, - btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } - - if ( btsk.btsk_datum != (Datum*) NULL ) - pfree ((void*)(btsk.btsk_datum)); - if ( btsk.btsk_nulls != (char*) NULL ) - pfree ((void*)(btsk.btsk_nulls)); - - } - itape = btspool->bts_itape[t]; - if (!tapedone[t]) { - BTItem newbti = _bt_tapenext(itape, &tapepos[t]); - - if (newbti == (BTItem) NULL) { - do { - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) { - tapepos[t] = itape->bttb_data; - newbti = _bt_tapenext(itape, &tapepos[t]); + break; /* for */ } - } - if (newbti != (BTItem) NULL) { - BTPriQueueElem nexte; - - nexte.btpqe_tape = t; - _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); - _bt_pqadd(&q, &nexte); - } + ++nruns; + + /* + * output the smallest element from the queue until there are + * no more. + */ + while (_bt_pqnext(&q, &e) >= 0) + { /* item */ + + /* + * replace the element taken from priority queue, fetching + * a new block if needed. a tape can run out if it hits + * either End-Of-Run or EOF. + */ + t = e.btpqe_tape; + btsk = e.btpqe_item; + bti = btsk.btsk_item; + if (bti != (BTItem) NULL) + { + btisz = BTITEMSZ(bti); + btisz = DOUBLEALIGN(btisz); + if (doleaf) + { + _bt_buildadd(index, state, bti, BTP_LEAF); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", + npass, nruns, d, t, + BufferGetBlockNumber(state->btps_buf)); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + } + else + { + if (SPCLEFT(otape) < btisz) + { + + /* + * if it's full, write it out and add the item + * to the next block. (since we will be + * adding another tuple immediately after + * this, we can be sure that there will be at + * least one more block in this run and so we + * know we do *not* want to set End-Of-Run + * here.) + */ + _bt_tapewrite(otape, 0); + } + _bt_tapeadd(otape, bti, btisz); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) + { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", + npass, nruns, d, t, + btspool->bts_tape); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + } + + if (btsk.btsk_datum != (Datum *) NULL) + pfree((void *) (btsk.btsk_datum)); + if (btsk.btsk_nulls != (char *) NULL) + pfree((void *) (btsk.btsk_nulls)); + + } + itape = btspool->bts_itape[t]; + if (!tapedone[t]) + { + BTItem newbti = _bt_tapenext(itape, &tapepos[t]); + + if (newbti == (BTItem) NULL) + { + do + { + if (_bt_taperead(itape) == 0) + { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) + { + tapepos[t] = itape->bttb_data; + newbti = _bt_tapenext(itape, &tapepos[t]); + } + } + if (newbti != (BTItem) NULL) + { + BTPriQueueElem nexte; + + nexte.btpqe_tape = t; + _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); + _bt_pqadd(&q, &nexte); + } + } + } /* item */ + + /* + * that's it for this run. flush the output tape, marking + * End-of-Run. + */ + _bt_tapewrite(otape, 1); + } /* run */ + + /* + * we are here because we ran out of input on all of the input + * tapes. + * + * if this pass did not generate more actual output runs than we have + * tapes, we know we have at most one run in each tape. this + * means that we are ready to merge into the final btree leaf + * pages instead of merging into a tape file. + */ + if (nruns <= btspool->bts_ntapes) + { + doleaf = true; } - } /* item */ - - /* - * that's it for this run. flush the output tape, marking - * End-of-Run. - */ - _bt_tapewrite(otape, 1); - } /* run */ - - /* - * we are here because we ran out of input on all of the input - * tapes. - * - * if this pass did not generate more actual output runs than - * we have tapes, we know we have at most one run in each - * tape. this means that we are ready to merge into the final - * btree leaf pages instead of merging into a tape file. - */ - if (nruns <= btspool->bts_ntapes) { - doleaf = true; - } - } while (nruns > 0); /* pass */ + } while (nruns > 0); /* pass */ - _bt_uppershutdown(index, state); + _bt_uppershutdown(index, state); } @@ -1320,62 +1397,65 @@ _bt_merge(Relation index, BTSpool *btspool) void _bt_upperbuild(Relation index) { - Buffer rbuf; - BlockNumber blk; - Page rpage; - BTPageOpaque ropaque; - BTPageState *state; - BTItem nbti; - - /* - * find the first leaf block. while we're at it, clear the - * BTP_ROOT flag that we set while building it (so we could find - * it later). - */ - rbuf = _bt_getroot(index, BT_WRITE); - blk = BufferGetBlockNumber(rbuf); - rpage = BufferGetPage(rbuf); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - ropaque->btpo_flags &= ~BTP_ROOT; - _bt_wrtbuf(index, rbuf); - - state = (BTPageState *) _bt_pagestate(index, 0, 0, true); - - /* for each page... */ - do { -#if 0 - printf("\t\tblk=%d\n", blk); -#endif - rbuf = _bt_getbuf(index, blk, BT_READ); + Buffer rbuf; + BlockNumber blk; + Page rpage; + BTPageOpaque ropaque; + BTPageState *state; + BTItem nbti; + + /* + * find the first leaf block. while we're at it, clear the BTP_ROOT + * flag that we set while building it (so we could find it later). + */ + rbuf = _bt_getroot(index, BT_WRITE); + blk = BufferGetBlockNumber(rbuf); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - - /* for each item... */ - if (!PageIsEmpty(rpage)) { - /* - * form a new index tuple corresponding to the minimum key - * of the lower page and insert it into a page at this - * level. - */ - nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); + ropaque->btpo_flags &= ~BTP_ROOT; + _bt_wrtbuf(index, rbuf); + + state = (BTPageState *) _bt_pagestate(index, 0, 0, true); + + /* for each page... */ + do + { +#if 0 + printf("\t\tblk=%d\n", blk); +#endif + rbuf = _bt_getbuf(index, blk, BT_READ); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + + /* for each item... */ + if (!PageIsEmpty(rpage)) + { + + /* + * form a new index tuple corresponding to the minimum key of + * the lower page and insert it into a page at this level. + */ + nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); #if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, - &isnull); - printf("_bt_upperbuild: inserting <%x> at %d\n", - d, state->btps_level); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - _bt_buildadd(index, state, nbti, 0); - pfree((void *) nbti); - } - blk = ropaque->btpo_next; - _bt_relbuf(index, rbuf, BT_READ); - } while (blk != P_NONE); - - _bt_uppershutdown(index, state); + { + bool isnull; + Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, + &isnull); + + printf("_bt_upperbuild: inserting <%x> at %d\n", + d, state->btps_level); + } +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + _bt_buildadd(index, state, nbti, 0); + pfree((void *) nbti); + } + blk = ropaque->btpo_next; + _bt_relbuf(index, rbuf, BT_READ); + } while (blk != P_NONE); + + _bt_uppershutdown(index, state); } + #endif /* @@ -1385,17 +1465,17 @@ _bt_upperbuild(Relation index) void _bt_leafbuild(Relation index, void *spool) { - _bt_isortcmpinit (index, (BTSpool *) spool); + _bt_isortcmpinit(index, (BTSpool *) spool); #ifdef BTREE_BUILD_STATS - if ( ShowExecutorStats ) - { - fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); - ShowUsage (); - ResetUsage (); - } + if (ShowExecutorStats) + { + fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); + ShowUsage(); + ResetUsage(); + } #endif - _bt_merge(index, (BTSpool *) spool); + _bt_merge(index, (BTSpool *) spool); } diff --git a/src/backend/access/nbtree/nbtstrat.c b/src/backend/access/nbtree/nbtstrat.c index 6de003c06a9..5215d2000d8 100644 --- a/src/backend/access/nbtree/nbtstrat.c +++ b/src/backend/access/nbtree/nbtstrat.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * btstrat.c-- - * Srategy map entries for the btree indexed access method + * Srategy map entries for the btree indexed access method * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.4 1996/11/05 10:35:37 scrappy Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.5 1997/09/07 04:39:04 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -20,111 +20,111 @@ /* * Note: - * StrategyNegate, StrategyCommute, and StrategyNegateCommute - * assume <, <=, ==, >=, > ordering. + * StrategyNegate, StrategyCommute, and StrategyNegateCommute + * assume <, <=, ==, >=, > ordering. */ -static StrategyNumber BTNegate[5] = { - BTGreaterEqualStrategyNumber, - BTGreaterStrategyNumber, - InvalidStrategy, - BTLessStrategyNumber, - BTLessEqualStrategyNumber +static StrategyNumber BTNegate[5] = { + BTGreaterEqualStrategyNumber, + BTGreaterStrategyNumber, + InvalidStrategy, + BTLessStrategyNumber, + BTLessEqualStrategyNumber }; -static StrategyNumber BTCommute[5] = { - BTGreaterStrategyNumber, - BTGreaterEqualStrategyNumber, - InvalidStrategy, - BTLessEqualStrategyNumber, - BTLessStrategyNumber +static StrategyNumber BTCommute[5] = { + BTGreaterStrategyNumber, + BTGreaterEqualStrategyNumber, + InvalidStrategy, + BTLessEqualStrategyNumber, + BTLessStrategyNumber }; -static StrategyNumber BTNegateCommute[5] = { - BTLessEqualStrategyNumber, - BTLessStrategyNumber, - InvalidStrategy, - BTGreaterStrategyNumber, - BTGreaterEqualStrategyNumber +static StrategyNumber BTNegateCommute[5] = { + BTLessEqualStrategyNumber, + BTLessStrategyNumber, + InvalidStrategy, + BTGreaterStrategyNumber, + BTGreaterEqualStrategyNumber }; -static uint16 BTLessTermData[] = { /* XXX type clash */ - 2, - BTLessStrategyNumber, - SK_NEGATE, - BTLessStrategyNumber, - SK_NEGATE | SK_COMMUTE +static uint16 BTLessTermData[] = { /* XXX type clash */ + 2, + BTLessStrategyNumber, + SK_NEGATE, + BTLessStrategyNumber, + SK_NEGATE | SK_COMMUTE }; -static uint16 BTLessEqualTermData[] = { /* XXX type clash */ - 2, - BTLessEqualStrategyNumber, - 0x0, - BTLessEqualStrategyNumber, - SK_COMMUTE +static uint16 BTLessEqualTermData[] = { /* XXX type clash */ + 2, + BTLessEqualStrategyNumber, + 0x0, + BTLessEqualStrategyNumber, + SK_COMMUTE }; static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */ - 2, - BTGreaterEqualStrategyNumber, - 0x0, - BTGreaterEqualStrategyNumber, - SK_COMMUTE - }; - -static uint16 BTGreaterTermData[] = { /* XXX type clash */ - 2, - BTGreaterStrategyNumber, - SK_NEGATE, - BTGreaterStrategyNumber, - SK_NEGATE | SK_COMMUTE + 2, + BTGreaterEqualStrategyNumber, + 0x0, + BTGreaterEqualStrategyNumber, + SK_COMMUTE }; -static StrategyTerm BTEqualExpressionData[] = { - (StrategyTerm)BTLessTermData, /* XXX */ - (StrategyTerm)BTLessEqualTermData, /* XXX */ - (StrategyTerm)BTGreaterEqualTermData, /* XXX */ - (StrategyTerm)BTGreaterTermData, /* XXX */ - NULL +static uint16 BTGreaterTermData[] = { /* XXX type clash */ + 2, + BTGreaterStrategyNumber, + SK_NEGATE, + BTGreaterStrategyNumber, + SK_NEGATE | SK_COMMUTE }; -static StrategyEvaluationData BTEvaluationData = { - /* XXX static for simplicity */ - - BTMaxStrategyNumber, - (StrategyTransformMap)BTNegate, /* XXX */ - (StrategyTransformMap)BTCommute, /* XXX */ - (StrategyTransformMap)BTNegateCommute, /* XXX */ +static StrategyTerm BTEqualExpressionData[] = { + (StrategyTerm) BTLessTermData, /* XXX */ + (StrategyTerm) BTLessEqualTermData, /* XXX */ + (StrategyTerm) BTGreaterEqualTermData, /* XXX */ + (StrategyTerm) BTGreaterTermData, /* XXX */ + NULL +}; + +static StrategyEvaluationData BTEvaluationData = { + /* XXX static for simplicity */ + + BTMaxStrategyNumber, + (StrategyTransformMap) BTNegate, /* XXX */ + (StrategyTransformMap) BTCommute, /* XXX */ + (StrategyTransformMap) BTNegateCommute, /* XXX */ - { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL, - NULL,NULL,NULL,NULL,NULL,NULL,NULL} + {NULL, NULL, (StrategyExpression) BTEqualExpressionData, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL} }; /* ---------------------------------------------------------------- - * RelationGetBTStrategy + * RelationGetBTStrategy * ---------------------------------------------------------------- */ StrategyNumber _bt_getstrat(Relation rel, - AttrNumber attno, - RegProcedure proc) + AttrNumber attno, + RegProcedure proc) { - StrategyNumber strat; - - strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc); - - Assert(StrategyNumberIsValid(strat)); - - return (strat); + StrategyNumber strat; + + strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc); + + Assert(StrategyNumberIsValid(strat)); + + return (strat); } bool _bt_invokestrat(Relation rel, - AttrNumber attno, - StrategyNumber strat, - Datum left, - Datum right) + AttrNumber attno, + StrategyNumber strat, + Datum left, + Datum right) { - return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat, - left, right)); + return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat, + left, right)); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 738e55dbccd..096f1d2691e 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * btutils.c-- - * Utility code for Postgres btree implementation. + * Utility code for Postgres btree implementation. * * Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.11 1997/08/19 21:29:47 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.12 1997/09/07 04:39:05 momjian Exp $ * *------------------------------------------------------------------------- */ @@ -23,367 +23,384 @@ #include <catalog/pg_proc.h> #include <executor/execdebug.h> -extern int NIndexTupleProcessed; +extern int NIndexTupleProcessed; #ifndef HAVE_MEMMOVE -# include <regex/utils.h> +#include <regex/utils.h> #else -# include <string.h> +#include <string.h> #endif -ScanKey +ScanKey _bt_mkscankey(Relation rel, IndexTuple itup) -{ - ScanKey skey; - TupleDesc itupdesc; - int natts; - int i; - Datum arg; - RegProcedure proc; - bool null; - bits16 flag; - - natts = rel->rd_rel->relnatts; - itupdesc = RelationGetTupleDescriptor(rel); - - skey = (ScanKey) palloc(natts * sizeof(ScanKeyData)); - - for (i = 0; i < natts; i++) { - arg = index_getattr(itup, i + 1, itupdesc, &null); - if ( null ) - { - proc = NullValueRegProcedure; - flag = SK_ISNULL; - } - else +{ + ScanKey skey; + TupleDesc itupdesc; + int natts; + int i; + Datum arg; + RegProcedure proc; + bool null; + bits16 flag; + + natts = rel->rd_rel->relnatts; + itupdesc = RelationGetTupleDescriptor(rel); + + skey = (ScanKey) palloc(natts * sizeof(ScanKeyData)); + + for (i = 0; i < natts; i++) { - proc = index_getprocid(rel, i + 1, BTORDER_PROC); - flag = 0x0; + arg = index_getattr(itup, i + 1, itupdesc, &null); + if (null) + { + proc = NullValueRegProcedure; + flag = SK_ISNULL; + } + else + { + proc = index_getprocid(rel, i + 1, BTORDER_PROC); + flag = 0x0; + } + ScanKeyEntryInitialize(&skey[i], + flag, (AttrNumber) (i + 1), proc, arg); } - ScanKeyEntryInitialize(&skey[i], - flag, (AttrNumber) (i + 1), proc, arg); - } - - return (skey); + + return (skey); } void _bt_freeskey(ScanKey skey) { - pfree(skey); + pfree(skey); } void _bt_freestack(BTStack stack) { - BTStack ostack; - - while (stack != (BTStack) NULL) { - ostack = stack; - stack = stack->bts_parent; - pfree(ostack->bts_btitem); - pfree(ostack); - } + BTStack ostack; + + while (stack != (BTStack) NULL) + { + ostack = stack; + stack = stack->bts_parent; + pfree(ostack->bts_btitem); + pfree(ostack); + } } /* - * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals. + * _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals. * - * The order of the keys in the qual match the ordering imposed by - * the index. This routine only needs to be called if there are - * more than one qual clauses using this index. + * The order of the keys in the qual match the ordering imposed by + * the index. This routine only needs to be called if there are + * more than one qual clauses using this index. */ void _bt_orderkeys(Relation relation, BTScanOpaque so) { - ScanKey xform; - ScanKeyData *cur; - StrategyMap map; - int nbytes; - long test; - int i, j; - int init[BTMaxStrategyNumber+1]; - ScanKey key; - uint16 numberOfKeys = so->numberOfKeys; - uint16 new_numberOfKeys = 0; - AttrNumber attno = 1; - - if ( numberOfKeys < 1 ) - return; - - key = so->keyData; - - cur = &key[0]; - if ( cur->sk_attno != 1 ) - elog (WARN, "_bt_orderkeys: key(s) for attribute 1 missed"); - - if ( numberOfKeys == 1 ) - { - /* - * We don't use indices for 'A is null' and 'A is not null' - * currently and 'A < = > <> NULL' is non-sense' - so - * qual is not Ok. - vadim 03/21/97 - */ - if ( cur->sk_flags & SK_ISNULL ) - so->qual_ok = 0; - so->numberOfFirstKeys = 1; - return; - } - - /* get space for the modified array of keys */ - nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData); - xform = (ScanKey) palloc(nbytes); - - memset(xform, 0, nbytes); - map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), - BTMaxStrategyNumber, - attno); - for (j = 0; j <= BTMaxStrategyNumber; j++) - init[j] = 0; - - /* check each key passed in */ - for (i = 0; ; ) - { - if ( i < numberOfKeys ) - cur = &key[i]; - - if ( cur->sk_flags & SK_ISNULL ) /* see comments above */ - so->qual_ok = 0; - - if ( i == numberOfKeys || cur->sk_attno != attno ) + ScanKey xform; + ScanKeyData *cur; + StrategyMap map; + int nbytes; + long test; + int i, + j; + int init[BTMaxStrategyNumber + 1]; + ScanKey key; + uint16 numberOfKeys = so->numberOfKeys; + uint16 new_numberOfKeys = 0; + AttrNumber attno = 1; + + if (numberOfKeys < 1) + return; + + key = so->keyData; + + cur = &key[0]; + if (cur->sk_attno != 1) + elog(WARN, "_bt_orderkeys: key(s) for attribute 1 missed"); + + if (numberOfKeys == 1) { - if ( cur->sk_attno != attno + 1 && i < numberOfKeys ) - { - elog (WARN, "_bt_orderkeys: key(s) for attribute %d missed", attno + 1); - } - /* - * If = has been specified, no other key will be used. - * In case of key < 2 && key == 1 and so on - * we have to set qual_ok to 0 - */ - if (init[BTEqualStrategyNumber - 1]) - { - ScanKeyData *eq, *chk; - - eq = &xform[BTEqualStrategyNumber - 1]; - for (j = BTMaxStrategyNumber; --j >= 0; ) - { - if ( j == (BTEqualStrategyNumber - 1) || init[j] == 0 ) - continue; - chk = &xform[j]; - test = (long) fmgr(chk->sk_procedure, eq->sk_argument, chk->sk_argument); - if (!test) - so->qual_ok = 0; - } - init[BTLessStrategyNumber - 1] = 0; - init[BTLessEqualStrategyNumber - 1] = 0; - init[BTGreaterEqualStrategyNumber - 1] = 0; - init[BTGreaterStrategyNumber - 1] = 0; - } - - /* only one of <, <= */ - if (init[BTLessStrategyNumber - 1] - && init[BTLessEqualStrategyNumber - 1]) - { - ScanKeyData *lt, *le; - - lt = &xform[BTLessStrategyNumber - 1]; - le = &xform[BTLessEqualStrategyNumber - 1]; + /* - * DO NOT use the cached function stuff here -- this is key - * ordering, happens only when the user expresses a hokey - * qualification, and gets executed only once, anyway. The - * transform maps are hard-coded, and can't be initialized - * in the correct way. + * We don't use indices for 'A is null' and 'A is not null' + * currently and 'A < = > <> NULL' is non-sense' - so qual is not + * Ok. - vadim 03/21/97 */ - test = (long) fmgr(le->sk_procedure, lt->sk_argument, le->sk_argument); - if (test) - init[BTLessEqualStrategyNumber - 1] = 0; - else - init[BTLessStrategyNumber - 1] = 0; - } - - /* only one of >, >= */ - if (init[BTGreaterStrategyNumber - 1] - && init[BTGreaterEqualStrategyNumber - 1]) - { - ScanKeyData *gt, *ge; - - gt = &xform[BTGreaterStrategyNumber - 1]; - ge = &xform[BTGreaterEqualStrategyNumber - 1]; - - /* see note above on function cache */ - test = (long) fmgr(ge->sk_procedure, gt->sk_argument, ge->sk_argument); - if (test) - init[BTGreaterEqualStrategyNumber - 1] = 0; - else - init[BTGreaterStrategyNumber - 1] = 0; - } - - /* okay, reorder and count */ - for (j = BTMaxStrategyNumber; --j >= 0; ) - if (init[j]) - key[new_numberOfKeys++] = xform[j]; - - if ( attno == 1 ) - so->numberOfFirstKeys = new_numberOfKeys; - - if ( i == numberOfKeys ) - break; - - /* initialization for new attno */ - attno = cur->sk_attno; - memset(xform, 0, nbytes); - map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), - BTMaxStrategyNumber, - attno); - /* haven't looked at any strategies yet */ - for (j = 0; j <= BTMaxStrategyNumber; j++) - init[j] = 0; + if (cur->sk_flags & SK_ISNULL) + so->qual_ok = 0; + so->numberOfFirstKeys = 1; + return; } - for (j = BTMaxStrategyNumber; --j >= 0; ) - { - if (cur->sk_procedure == map->entry[j].sk_procedure) - break; - } - - /* have we seen one of these before? */ - if (init[j]) - { - /* yup, use the appropriate value */ - test = - (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure, - cur->sk_argument, xform[j].sk_argument); - if (test) - xform[j].sk_argument = cur->sk_argument; - else if ( j == (BTEqualStrategyNumber - 1) ) - so->qual_ok = 0; /* key == a && key == b, but a != b */ - } else + /* get space for the modified array of keys */ + nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData); + xform = (ScanKey) palloc(nbytes); + + memset(xform, 0, nbytes); + map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + BTMaxStrategyNumber, + attno); + for (j = 0; j <= BTMaxStrategyNumber; j++) + init[j] = 0; + + /* check each key passed in */ + for (i = 0;;) { - /* nope, use this value */ - memmove(&xform[j], cur, sizeof(*cur)); - init[j] = 1; + if (i < numberOfKeys) + cur = &key[i]; + + if (cur->sk_flags & SK_ISNULL) /* see comments above */ + so->qual_ok = 0; + + if (i == numberOfKeys || cur->sk_attno != attno) + { + if (cur->sk_attno != attno + 1 && i < numberOfKeys) + { + elog(WARN, "_bt_orderkeys: key(s) for attribute %d missed", attno + 1); + } + + /* + * If = has been specified, no other key will be used. In case + * of key < 2 && key == 1 and so on we have to set qual_ok to + * 0 + */ + if (init[BTEqualStrategyNumber - 1]) + { + ScanKeyData *eq, + *chk; + + eq = &xform[BTEqualStrategyNumber - 1]; + for (j = BTMaxStrategyNumber; --j >= 0;) + { + if (j == (BTEqualStrategyNumber - 1) || init[j] == 0) + continue; + chk = &xform[j]; + test = (long) fmgr(chk->sk_procedure, eq->sk_argument, chk->sk_argument); + if (!test) + so->qual_ok = 0; + } + init[BTLessStrategyNumber - 1] = 0; + init[BTLessEqualStrategyNumber - 1] = 0; + init[BTGreaterEqualStrategyNumber - 1] = 0; + init[BTGreaterStrategyNumber - 1] = 0; + } + + /* only one of <, <= */ + if (init[BTLessStrategyNumber - 1] + && init[BTLessEqualStrategyNumber - 1]) + { + ScanKeyData *lt, + *le; + + lt = &xform[BTLessStrategyNumber - 1]; + le = &xform[BTLessEqualStrategyNumber - 1]; + + /* + * DO NOT use the cached function stuff here -- this is + * key ordering, happens only when the user expresses a + * hokey qualification, and gets executed only once, + * anyway. The transform maps are hard-coded, and can't + * be initialized in the correct way. + */ + test = (long) fmgr(le->sk_procedure, lt->sk_argument, le->sk_argument); + if (test) + init[BTLessEqualStrategyNumber - 1] = 0; + else + init[BTLessStrategyNumber - 1] = 0; + } + + /* only one of >, >= */ + if (init[BTGreaterStrategyNumber - 1] + && init[BTGreaterEqualStrategyNumber - 1]) + { + ScanKeyData *gt, + *ge; + + gt = &xform[BTGreaterStrategyNumber - 1]; + ge = &xform[BTGreaterEqualStrategyNumber - 1]; + + /* see note above on function cache */ + test = (long) fmgr(ge->sk_procedure, gt->sk_argument, ge->sk_argument); + if (test) + init[BTGreaterEqualStrategyNumber - 1] = 0; + else + init[BTGreaterStrategyNumber - 1] = 0; + } + + /* okay, reorder and count */ + for (j = BTMaxStrategyNumber; --j >= 0;) + if (init[j]) + key[new_numberOfKeys++] = xform[j]; + + if (attno == 1) + so->numberOfFirstKeys = new_numberOfKeys; + + if (i == numberOfKeys) + break; + + /* initialization for new attno */ + attno = cur->sk_attno; + memset(xform, 0, nbytes); + map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation), + BTMaxStrategyNumber, + attno); + /* haven't looked at any strategies yet */ + for (j = 0; j <= BTMaxStrategyNumber; j++) + init[j] = 0; + } + + for (j = BTMaxStrategyNumber; --j >= 0;) + { + if (cur->sk_procedure == map->entry[j].sk_procedure) + break; + } + + /* have we seen one of these before? */ + if (init[j]) + { + /* yup, use the appropriate value */ + test = + (long) FMGR_PTR2(cur->sk_func, cur->sk_procedure, + cur->sk_argument, xform[j].sk_argument); + if (test) + xform[j].sk_argument = cur->sk_argument; + else if (j == (BTEqualStrategyNumber - 1)) + so->qual_ok = 0;/* key == a && key == b, but a != b */ + } + else + { + /* nope, use this value */ + memmove(&xform[j], cur, sizeof(*cur)); + init[j] = 1; + } + + i++; } - - i++; - } - - so->numberOfKeys = new_numberOfKeys; - - pfree(xform); + + so->numberOfKeys = new_numberOfKeys; + + pfree(xform); } BTItem _bt_formitem(IndexTuple itup) { - int nbytes_btitem; - BTItem btitem; - Size tuplen; - extern Oid newoid(); - - /* see comments in btbuild - - if (itup->t_info & INDEX_NULL_MASK) - elog(WARN, "btree indices cannot include null keys"); - */ - - /* make a copy of the index tuple with room for the sequence number */ - tuplen = IndexTupleSize(itup); - nbytes_btitem = tuplen + - (sizeof(BTItemData) - sizeof(IndexTupleData)); - - btitem = (BTItem) palloc(nbytes_btitem); - memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen); - + int nbytes_btitem; + BTItem btitem; + Size tuplen; + extern Oid newoid(); + + /* + * see comments in btbuild + * + * if (itup->t_info & INDEX_NULL_MASK) elog(WARN, "btree indices cannot + * include null keys"); + */ + + /* make a copy of the index tuple with room for the sequence number */ + tuplen = IndexTupleSize(itup); + nbytes_btitem = tuplen + + (sizeof(BTItemData) - sizeof(IndexTupleData)); + + btitem = (BTItem) palloc(nbytes_btitem); + memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen); + #ifndef BTREE_VERSION_1 - btitem->bti_oid = newoid(); + btitem->bti_oid = newoid(); #endif - return (btitem); + return (btitem); } #ifdef NOT_USED bool _bt_checkqual(IndexScanDesc scan, IndexTuple itup) { - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - if (so->numberOfKeys > 0) - return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation), - so->numberOfKeys, so->keyData)); - else - return (true); + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + if (so->numberOfKeys > 0) + return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation), + so->numberOfKeys, so->keyData)); + else + return (true); } + #endif #ifdef NOT_USED bool _bt_checkforkeys(IndexScanDesc scan, IndexTuple itup, Size keysz) { - BTScanOpaque so; - - so = (BTScanOpaque) scan->opaque; - if ( keysz > 0 && so->numberOfKeys >= keysz ) - return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation), - keysz, so->keyData)); - else - return (true); + BTScanOpaque so; + + so = (BTScanOpaque) scan->opaque; + if (keysz > 0 && so->numberOfKeys >= keysz) + return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation), + keysz, so->keyData)); + else + return (true); } + #endif bool -_bt_checkkeys (IndexScanDesc scan, IndexTuple tuple, Size *keysok) +_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, Size * keysok) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Size keysz = so->numberOfKeys; - TupleDesc tupdesc; - ScanKey key; - Datum datum; - bool isNull; - int test; - - *keysok = 0; - if ( keysz == 0 ) - return (true); - - key = so->keyData; - tupdesc = RelationGetTupleDescriptor(scan->relation); - - IncrIndexProcessed(); - - while (keysz > 0) - { - datum = index_getattr(tuple, - key[0].sk_attno, - tupdesc, - &isNull); - - /* btree doesn't support 'A is null' clauses, yet */ - if ( isNull || key[0].sk_flags & SK_ISNULL ) + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Size keysz = so->numberOfKeys; + TupleDesc tupdesc; + ScanKey key; + Datum datum; + bool isNull; + int test; + + *keysok = 0; + if (keysz == 0) + return (true); + + key = so->keyData; + tupdesc = RelationGetTupleDescriptor(scan->relation); + + IncrIndexProcessed(); + + while (keysz > 0) { - return (false); - } + datum = index_getattr(tuple, + key[0].sk_attno, + tupdesc, + &isNull); - if (key[0].sk_flags & SK_COMMUTE) { - test = (int) (*(key[0].sk_func)) - (DatumGetPointer(key[0].sk_argument), - datum); - } else { - test = (int) (*(key[0].sk_func)) - (datum, - DatumGetPointer(key[0].sk_argument)); - } - - if (!test == !(key[0].sk_flags & SK_NEGATE)) { - return (false); + /* btree doesn't support 'A is null' clauses, yet */ + if (isNull || key[0].sk_flags & SK_ISNULL) + { + return (false); + } + + if (key[0].sk_flags & SK_COMMUTE) + { + test = (int) (*(key[0].sk_func)) + (DatumGetPointer(key[0].sk_argument), + datum); + } + else + { + test = (int) (*(key[0].sk_func)) + (datum, + DatumGetPointer(key[0].sk_argument)); + } + + if (!test == !(key[0].sk_flags & SK_NEGATE)) + { + return (false); + } + + keysz -= 1; + key++; + (*keysok)++; } - - keysz -= 1; - key++; - (*keysok)++; - } - - return (true); + + return (true); } |