diff options
| field | value | date |
|---|---|---|
| author | Tom Lane | 2010-02-07 20:48:13 +0000 |
| committer | Tom Lane | 2010-02-07 20:48:13 +0000 |
| commit | b9b8831ad60f6e4bd580fe6dbe9749359298a3c4 (patch) | |
| tree | af6948498f13a43edd982b05808ed89b5b8191ab /src | |
| parent | 7fc30c488fc6e9674564206193c29b1a657a818f (diff) | |

Create a "relation mapping" infrastructure to support changing the relfilenodes
of shared or nailed system catalogs. This has two key benefits:

* The new CLUSTER-based VACUUM FULL can be applied safely to all catalogs.
* We no longer have to use an unsafe reindex-in-place approach for reindexing
  shared catalogs.

CLUSTER on nailed catalogs now works too, although I left it disabled on
shared catalogs because the resulting pg_index.indisclustered update would
only be visible in one database.
Since reindexing shared system catalogs is now fully transactional and
crash-safe, the former special cases in REINDEX behavior have been removed;
shared catalogs are treated the same as non-shared.
This commit does not do anything about the recently-discussed problem of
deadlocks between VACUUM FULL/CLUSTER on a system catalog and other
concurrent queries; that will be addressed in a separate patch. As a stopgap,
parallel_schedule has been tweaked to run vacuum.sql by itself, to avoid
such failures during the regression tests.
Diffstat (limited to 'src')
46 files changed, 2187 insertions, 507 deletions
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index ddc3f6b4f96..bd280360879 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.79 2010/01/02 16:57:35 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.80 2010/02/07 20:48:09 tgl Exp $ * * NOTES * many of the old access method routines have been turned into @@ -21,6 +21,7 @@ #include "access/relscan.h" #include "access/transam.h" +#include "catalog/index.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" @@ -419,7 +420,7 @@ systable_beginscan_ordered(Relation heapRelation, /* REINDEX can probably be a hard error here ... */ if (ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) - elog(ERROR, "cannot do ordered scan on index \"%s\", because it is the current REINDEX target", + elog(ERROR, "cannot do ordered scan on index \"%s\", because it is being reindexed", RelationGetRelationName(indexRelation)); /* ... 
but we only throw a warning about violating IgnoreSystemIndexes */ if (IgnoreSystemIndexes) diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 7e1e0f60fc3..8038b25d1d0 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.28 2009/12/19 01:32:33 sriggs Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.29 2010/02/07 20:48:09 tgl Exp $ */ #include "postgres.h" @@ -22,6 +22,7 @@ #include "commands/tablespace.h" #include "storage/freespace.h" #include "storage/standby.h" +#include "utils/relmapper.h" const RmgrData RmgrTable[RM_MAX_ID + 1] = { @@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Database", dbase_redo, dbase_desc, NULL, NULL, NULL}, {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, - {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, + {"RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL}, {"Standby", standby_redo, standby_desc, NULL, NULL, NULL}, {"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL}, {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f74a941f66e..156ed5c47be 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.282 2010/01/24 21:49:17 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.283 2010/02/07 20:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,6 +48,7 @@ #include "utils/inval.h" #include "utils/memutils.h" #include "utils/relcache.h" +#include "utils/relmapper.h" #include "utils/snapmgr.h" #include "pg_trace.h" @@ -250,7 +251,7 @@ static void AbortTransaction(void); 
static void AtAbort_Memory(void); static void AtCleanup_Memory(void); static void AtAbort_ResourceOwner(void); -static void AtCommit_LocalCache(void); +static void AtCCI_LocalCache(void); static void AtCommit_Memory(void); static void AtStart_Cache(void); static void AtStart_Memory(void); @@ -703,7 +704,7 @@ CommandCounterIncrement(void) * read-only command. (But see hacks in inval.c to make real sure we * don't think a command that queued inval messages was read-only.) */ - AtCommit_LocalCache(); + AtCCI_LocalCache(); } /* @@ -1095,12 +1096,20 @@ cleanup: /* - * AtCommit_LocalCache + * AtCCI_LocalCache */ static void -AtCommit_LocalCache(void) +AtCCI_LocalCache(void) { /* + * Make any pending relation map changes visible. We must do this + * before processing local sinval messages, so that the map changes + * will get reflected into the relcache when relcache invals are + * processed. + */ + AtCCI_RelationMap(); + + /* * Make catalog changes visible to me for the next command. */ CommandEndInvalidationMessages(); @@ -1734,6 +1743,9 @@ CommitTransaction(void) /* Prevent cancel/die interrupt while cleaning up */ HOLD_INTERRUPTS(); + /* Commit updates to the relation map --- do this as late as possible */ + AtEOXact_RelationMap(true); + /* * set the current transaction state information appropriately during * commit processing @@ -1980,6 +1992,7 @@ PrepareTransaction(void) AtPrepare_Locks(); AtPrepare_PgStat(); AtPrepare_MultiXact(); + AtPrepare_RelationMap(); /* * Here is where we really truly prepare. 
@@ -2148,10 +2161,11 @@ AbortTransaction(void) /* * do abort processing */ - AfterTriggerEndXact(false); + AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); - AtEOXact_LargeObject(false); /* 'false' means it's abort */ + AtEOXact_LargeObject(false); AtAbort_Notify(); + AtEOXact_RelationMap(false); /* * Advertise the fact that we aborted in pg_clog (assuming that we got as @@ -4625,11 +4639,18 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec) SharedInvalidationMessage *msg = &msgs[i]; if (msg->id >= 0) - appendStringInfo(buf, "catcache id%d ", msg->id); + appendStringInfo(buf, " catcache %d", msg->id); + else if (msg->id == SHAREDINVALCATALOG_ID) + appendStringInfo(buf, " catalog %u", msg->cat.catId); else if (msg->id == SHAREDINVALRELCACHE_ID) - appendStringInfo(buf, "relcache "); + appendStringInfo(buf, " relcache %u", msg->rc.relId); + /* remaining cases not expected, but print something anyway */ else if (msg->id == SHAREDINVALSMGR_ID) - appendStringInfo(buf, "smgr "); + appendStringInfo(buf, " smgr"); + else if (msg->id == SHAREDINVALRELMAP_ID) + appendStringInfo(buf, " relmap"); + else + appendStringInfo(buf, " unknown id %d", msg->id); } } } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 49adda12f9a..f4b03f4c1be 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.366 2010/02/01 13:40:28 sriggs Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.367 2010/02/07 20:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,7 @@ #include "utils/builtins.h" #include "utils/guc.h" #include "utils/ps_status.h" +#include "utils/relmapper.h" #include "pg_trace.h" @@ -7123,6 
+7124,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointCLOG(); CheckPointSUBTRANS(); CheckPointMultiXact(); + CheckPointRelationMap(); CheckPointBuffers(flags); /* performs all required fsyncs */ /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index a6c1243b958..9cc68501ffc 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.104 2010/01/28 23:21:11 petere Exp $ + * $PostgreSQL: pgsql/src/backend/bootstrap/bootparse.y,v 1.105 2010/02/07 20:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -185,11 +185,26 @@ Boot_CreateStmt: RPAREN { TupleDesc tupdesc; + bool shared_relation; + bool mapped_relation; do_start(); tupdesc = CreateTupleDesc(numattr, !($6), attrtypes); + shared_relation = $5; + + /* + * The catalogs that use the relation mapper are the + * bootstrap catalogs plus the shared catalogs. If this + * ever gets more complicated, we should invent a BKI + * keyword to mark the mapped catalogs, but for now a + * quick hack seems the most appropriate thing. Note in + * particular that all "nailed" heap rels (see formrdesc + * in relcache.c) must be mapped. + */ + mapped_relation = ($4 || shared_relation); + if ($4) { if (boot_reldesc) @@ -200,11 +215,12 @@ Boot_CreateStmt: boot_reldesc = heap_create($2, PG_CATALOG_NAMESPACE, - $5 ? GLOBALTABLESPACE_OID : 0, + shared_relation ? GLOBALTABLESPACE_OID : 0, $3, tupdesc, RELKIND_RELATION, - $5, + shared_relation, + mapped_relation, true); elog(DEBUG4, "bootstrap relation created"); } @@ -214,7 +230,7 @@ Boot_CreateStmt: id = heap_create_with_catalog($2, PG_CATALOG_NAMESPACE, - $5 ? GLOBALTABLESPACE_OID : 0, + shared_relation ? 
GLOBALTABLESPACE_OID : 0, $3, $7, InvalidOid, @@ -222,7 +238,8 @@ Boot_CreateStmt: tupdesc, NIL, RELKIND_RELATION, - $5, + shared_relation, + mapped_relation, true, 0, ONCOMMIT_NOOP, diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 14e4b839e44..d2b7c1e5854 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.258 2010/01/22 16:40:18 rhaas Exp $ + * $PostgreSQL: pgsql/src/backend/bootstrap/bootstrap.c,v 1.259 2010/02/07 20:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,6 +42,7 @@ #include "utils/fmgroids.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#include "utils/relmapper.h" #include "utils/tqual.h" extern int optind; @@ -491,6 +492,12 @@ BootstrapModeMain(void) */ boot_yyparse(); + /* + * We should now know about all mapped relations, so it's okay to + * write out the initial relation mapping files. + */ + RelationMapFinishBootstrap(); + /* Perform a checkpoint to ensure everything's down to disk */ SetProcessingMode(NormalProcessing); CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 1ce2f855100..943cc4920ec 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.87 2010/01/12 02:42:51 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/catalog.c,v 1.88 2010/02/07 20:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -460,16 +460,16 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) * created by bootstrap have preassigned OIDs, so there's no need. 
*/ Oid -GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class) +GetNewRelFileNode(Oid reltablespace, Relation pg_class) { RelFileNode rnode; char *rpath; int fd; bool collides; - /* This should match RelationInitPhysicalAddr */ + /* This logic should match RelationInitPhysicalAddr */ rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace; - rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId; + rnode.dbNode = (rnode.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId; do { diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index c344b8e01cb..bc232cd143b 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.369 2010/02/03 01:14:16 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/heap.c,v 1.370 2010/02/07 20:48:09 tgl Exp $ * * * INTERFACE ROUTINES @@ -237,6 +237,7 @@ heap_create(const char *relname, TupleDesc tupDesc, char relkind, bool shared_relation, + bool mapped_relation, bool allow_system_table_mods) { bool create_storage; @@ -307,7 +308,8 @@ heap_create(const char *relname, tupDesc, relid, reltablespace, - shared_relation); + shared_relation, + mapped_relation); /* * Have the storage manager create the relation's disk file, if needed. 
@@ -364,7 +366,8 @@ heap_create(const char *relname, * -------------------------------- */ void -CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind) +CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind, + bool allow_system_table_mods) { int i; int j; @@ -418,7 +421,8 @@ CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind) for (i = 0; i < natts; i++) { CheckAttributeType(NameStr(tupdesc->attrs[i]->attname), - tupdesc->attrs[i]->atttypid); + tupdesc->attrs[i]->atttypid, + allow_system_table_mods); } } @@ -431,7 +435,8 @@ CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind) * -------------------------------- */ void -CheckAttributeType(const char *attname, Oid atttypid) +CheckAttributeType(const char *attname, Oid atttypid, + bool allow_system_table_mods) { char att_typtype = get_typtype(atttypid); @@ -450,9 +455,11 @@ CheckAttributeType(const char *attname, Oid atttypid) { /* * Refuse any attempt to create a pseudo-type column, except for a - * special hack for pg_statistic: allow ANYARRAY during initdb + * special hack for pg_statistic: allow ANYARRAY when modifying + * system catalogs (this allows creating pg_statistic and cloning it + * during VACUUM FULL) */ - if (atttypid != ANYARRAYOID || IsUnderPostmaster) + if (atttypid != ANYARRAYOID || !allow_system_table_mods) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("column \"%s\" has pseudo-type %s", @@ -479,7 +486,8 @@ CheckAttributeType(const char *attname, Oid atttypid) if (attr->attisdropped) continue; - CheckAttributeType(NameStr(attr->attname), attr->atttypid); + CheckAttributeType(NameStr(attr->attname), attr->atttypid, + allow_system_table_mods); } relation_close(relation, AccessShareLock); @@ -865,6 +873,7 @@ AddNewRelationType(const char *typeName, * cooked_constraints: list of precooked check constraints and defaults * relkind: relkind for new rel * shared_relation: TRUE if it's to be a shared relation + * mapped_relation: TRUE if the relation will use the 
relfilenode map * oidislocal: TRUE if oid column (if any) should be marked attislocal * oidinhcount: attinhcount to assign to oid column (if any) * oncommit: ON COMMIT marking (only relevant if it's a temp table) @@ -888,6 +897,7 @@ heap_create_with_catalog(const char *relname, List *cooked_constraints, char relkind, bool shared_relation, + bool mapped_relation, bool oidislocal, int oidinhcount, OnCommitAction oncommit, @@ -909,7 +919,7 @@ heap_create_with_catalog(const char *relname, */ Assert(IsNormalProcessingMode() || IsBootstrapProcessingMode()); - CheckAttributeNamesTypes(tupdesc, relkind); + CheckAttributeNamesTypes(tupdesc, relkind, allow_system_table_mods); if (get_relname_relid(relname, relnamespace)) ereport(ERROR, @@ -938,23 +948,10 @@ heap_create_with_catalog(const char *relname, } /* - * Validate shared/non-shared tablespace (must check this before doing - * GetNewRelFileNode, to prevent Assert therein) + * Shared relations must be in pg_global (last-ditch check) */ - if (shared_relation) - { - if (reltablespace != GLOBALTABLESPACE_OID) - /* elog since this is not a user-facing error */ - elog(ERROR, - "shared relations must be placed in pg_global tablespace"); - } - else - { - if (reltablespace == GLOBALTABLESPACE_OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("only shared relations can be placed in pg_global tablespace"))); - } + if (shared_relation && reltablespace != GLOBALTABLESPACE_OID) + elog(ERROR, "shared relations must be placed in pg_global tablespace"); /* * Allocate an OID for the relation, unless we were told what to use. 
@@ -979,8 +976,7 @@ heap_create_with_catalog(const char *relname, binary_upgrade_next_toast_relfilenode = InvalidOid; } else - relid = GetNewRelFileNode(reltablespace, shared_relation, - pg_class_desc); + relid = GetNewRelFileNode(reltablespace, pg_class_desc); } /* @@ -1019,6 +1015,7 @@ heap_create_with_catalog(const char *relname, tupdesc, relkind, shared_relation, + mapped_relation, allow_system_table_mods); Assert(relid == RelationGetRelid(new_rel_desc)); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index c6b6e76933f..e614d3baf6a 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.332 2010/02/03 01:14:16 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.333 2010/02/07 20:48:09 tgl Exp $ * * * INTERFACE ROUTINES @@ -111,6 +111,11 @@ static void validate_index_heapscan(Relation heapRelation, Snapshot snapshot, v_i_state *state); static Oid IndexGetRelation(Oid indexId); +static void SetReindexProcessing(Oid heapOid, Oid indexOid); +static void ResetReindexProcessing(void); +static void SetReindexPending(List *indexes); +static void RemoveReindexPending(Oid indexOid); +static void ResetReindexPending(void); /* @@ -257,7 +262,7 @@ ConstructTupleDescriptor(Relation heapRelation, * whether a table column is of a safe type (which is why we * needn't check for the non-expression case). */ - CheckAttributeType(NameStr(to->attname), to->atttypid); + CheckAttributeType(NameStr(to->attname), to->atttypid, false); } /* @@ -544,6 +549,7 @@ index_create(Oid heapRelationId, Relation indexRelation; TupleDesc indexTupDesc; bool shared_relation; + bool mapped_relation; bool is_exclusion; Oid namespaceId; int i; @@ -562,10 +568,12 @@ index_create(Oid heapRelationId, /* * The index will be in the same namespace as its parent table, and is - * shared across databases if and only if the parent is. 
+ * shared across databases if and only if the parent is. Likewise, + * it will use the relfilenode map if and only if the parent does. */ namespaceId = RelationGetNamespace(heapRelation); shared_relation = heapRelation->rd_rel->relisshared; + mapped_relation = RelationIsMapped(heapRelation); /* * check parameters @@ -609,23 +617,10 @@ index_create(Oid heapRelationId, errmsg("shared indexes cannot be created after initdb"))); /* - * Validate shared/non-shared tablespace (must check this before doing - * GetNewRelFileNode, to prevent Assert therein) + * Shared relations must be in pg_global, too (last-ditch check) */ - if (shared_relation) - { - if (tableSpaceId != GLOBALTABLESPACE_OID) - /* elog since this is not a user-facing error */ - elog(ERROR, - "shared relations must be placed in pg_global tablespace"); - } - else - { - if (tableSpaceId == GLOBALTABLESPACE_OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("only shared relations can be placed in pg_global tablespace"))); - } + if (shared_relation && tableSpaceId != GLOBALTABLESPACE_OID) + elog(ERROR, "shared relations must be placed in pg_global tablespace"); if (get_relname_relid(indexRelationName, namespaceId)) ereport(ERROR, @@ -657,8 +652,7 @@ index_create(Oid heapRelationId, binary_upgrade_next_index_relfilenode = InvalidOid; } else - indexRelationId = GetNewRelFileNode(tableSpaceId, shared_relation, - pg_class); + indexRelationId = GetNewRelFileNode(tableSpaceId, pg_class); } /* @@ -673,6 +667,7 @@ index_create(Oid heapRelationId, indexTupDesc, RELKIND_INDEX, shared_relation, + mapped_relation, allow_system_table_mods); Assert(indexRelationId == RelationGetRelid(indexRelation)); @@ -2413,7 +2408,6 @@ reindex_index(Oid indexId) heapRelation, pg_index; Oid heapId; - bool inplace; IndexInfo *indexInfo; HeapTuple indexTuple; Form_pg_index indexForm; @@ -2446,23 +2440,6 @@ reindex_index(Oid indexId) */ CheckTableNotInUse(iRel, "REINDEX INDEX"); - /* - * If it's a shared index, we 
must do inplace processing (because we have - * no way to update relfilenode in other databases). Otherwise we can do - * it the normal transaction-safe way. - * - * Since inplace processing isn't crash-safe, we only allow it in a - * standalone backend. (In the REINDEX TABLE and REINDEX DATABASE cases, - * the caller should have detected this.) - */ - inplace = iRel->rd_rel->relisshared; - - if (inplace && IsUnderPostmaster) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("shared index \"%s\" can only be reindexed in stand-alone mode", - RelationGetRelationName(iRel)))); - PG_TRY(); { /* Suppress use of the target index while rebuilding it */ @@ -2471,20 +2448,8 @@ reindex_index(Oid indexId) /* Fetch info needed for index_build */ indexInfo = BuildIndexInfo(iRel); - if (inplace) - { - /* - * Truncate the actual file (and discard buffers). - */ - RelationTruncate(iRel, 0); - } - else - { - /* - * We'll build a new physical relation for the index. - */ - RelationSetNewRelfilenode(iRel, InvalidTransactionId); - } + /* We'll build a new physical relation for the index */ + RelationSetNewRelfilenode(iRel, InvalidTransactionId); /* Initialize the index and rebuild */ /* Note: we do not need to re-establish pkey setting */ @@ -2538,19 +2503,27 @@ reindex_index(Oid indexId) * reindex_relation - This routine is used to recreate all indexes * of a relation (and optionally its toast relation too, if any). * + * If heap_rebuilt is true, then the relation was just completely rebuilt by + * an operation such as VACUUM FULL or CLUSTER, and therefore its indexes are + * inconsistent with it. This makes things tricky if the relation is a system + * catalog that we might consult during the reindexing. To deal with that + * case, we mark all of the indexes as pending rebuild so that they won't be + * trusted until rebuilt. 
The caller is required to call us *without* having + * made the rebuilt versions visible by doing CommandCounterIncrement; we'll + * do CCI after having collected the index list. (This way we can still use + * catalog indexes while collecting the list.) + * * Returns true if any indexes were rebuilt. Note that a * CommandCounterIncrement will occur after each index rebuild. */ bool -reindex_relation(Oid relid, bool toast_too) +reindex_relation(Oid relid, bool toast_too, bool heap_rebuilt) { Relation rel; Oid toast_relid; + List *indexIds; bool is_pg_class; bool result; - List *indexIds, - *doneIndexes; - ListCell *indexId; /* * Open and lock the relation. ShareLock is sufficient since we only need @@ -2580,9 +2553,9 @@ reindex_relation(Oid relid, bool toast_too) * It is okay to not insert entries into the indexes we have not processed * yet because all of this is transaction-safe. If we fail partway * through, the updated rows are dead and it doesn't matter whether they - * have index entries. Also, a new pg_class index will be created with an - * entry for its own pg_class row because we do RelationSetNewRelfilenode() - * before we do index_build(). + * have index entries. Also, a new pg_class index will be created with a + * correct entry for its own pg_class row because we do + * RelationSetNewRelfilenode() before we do index_build(). * * Note that we also clear pg_class's rd_oidindex until the loop is done, * so that that index can't be accessed either. This means we cannot @@ -2595,22 +2568,51 @@ reindex_relation(Oid relid, bool toast_too) if (is_pg_class) (void) RelationGetIndexAttrBitmap(rel); - /* Reindex all the indexes. 
*/ - doneIndexes = NIL; - foreach(indexId, indexIds) + PG_TRY(); { - Oid indexOid = lfirst_oid(indexId); + List *doneIndexes; + ListCell *indexId; - if (is_pg_class) - RelationSetIndexList(rel, doneIndexes, InvalidOid); + if (heap_rebuilt) + { + /* Suppress use of all the indexes until they are rebuilt */ + SetReindexPending(indexIds); - reindex_index(indexOid); + /* + * Make the new heap contents visible --- now things might be + * inconsistent! + */ + CommandCounterIncrement(); + } - CommandCounterIncrement(); + /* Reindex all the indexes. */ + doneIndexes = NIL; + foreach(indexId, indexIds) + { + Oid indexOid = lfirst_oid(indexId); + + if (is_pg_class) + RelationSetIndexList(rel, doneIndexes, InvalidOid); - if (is_pg_class) - doneIndexes = lappend_oid(doneIndexes, indexOid); + reindex_index(indexOid); + + CommandCounterIncrement(); + + if (heap_rebuilt) + RemoveReindexPending(indexOid); + + if (is_pg_class) + doneIndexes = lappend_oid(doneIndexes, indexOid); + } + } + PG_CATCH(); + { + /* Make sure list gets cleared on error exit */ + ResetReindexPending(); + PG_RE_THROW(); } + PG_END_TRY(); + ResetReindexPending(); if (is_pg_class) RelationSetIndexList(rel, indexIds, ClassOidIndexId); @@ -2627,7 +2629,107 @@ reindex_relation(Oid relid, bool toast_too) * still hold the lock on the master table. */ if (toast_too && OidIsValid(toast_relid)) - result |= reindex_relation(toast_relid, false); + result |= reindex_relation(toast_relid, false, false); return result; } + + +/* ---------------------------------------------------------------- + * System index reindexing support + * + * When we are busy reindexing a system index, this code provides support + * for preventing catalog lookups from using that index. 
+ * ---------------------------------------------------------------- + */ + +static Oid currentlyReindexedHeap = InvalidOid; +static Oid currentlyReindexedIndex = InvalidOid; +static List *pendingReindexedIndexes = NIL; + +/* + * ReindexIsProcessingHeap + * True if heap specified by OID is currently being reindexed. + */ +bool +ReindexIsProcessingHeap(Oid heapOid) +{ + return heapOid == currentlyReindexedHeap; +} + +/* + * ReindexIsProcessingIndex + * True if index specified by OID is currently being reindexed, + * or should be treated as invalid because it is awaiting reindex. + */ +bool +ReindexIsProcessingIndex(Oid indexOid) +{ + return indexOid == currentlyReindexedIndex || + list_member_oid(pendingReindexedIndexes, indexOid); +} + +/* + * SetReindexProcessing + * Set flag that specified heap/index are being reindexed. + * + * NB: caller must use a PG_TRY block to ensure ResetReindexProcessing is done. + */ +static void +SetReindexProcessing(Oid heapOid, Oid indexOid) +{ + Assert(OidIsValid(heapOid) && OidIsValid(indexOid)); + /* Reindexing is not re-entrant. */ + if (OidIsValid(currentlyReindexedHeap)) + elog(ERROR, "cannot reindex while reindexing"); + currentlyReindexedHeap = heapOid; + currentlyReindexedIndex = indexOid; +} + +/* + * ResetReindexProcessing + * Unset reindexing status. + */ +static void +ResetReindexProcessing(void) +{ + currentlyReindexedHeap = InvalidOid; + currentlyReindexedIndex = InvalidOid; +} + +/* + * SetReindexPending + * Mark the given indexes as pending reindex. + * + * NB: caller must use a PG_TRY block to ensure ResetReindexPending is done. + * Also, we assume that the current memory context stays valid throughout. + */ +static void +SetReindexPending(List *indexes) +{ + /* Reindexing is not re-entrant. */ + if (pendingReindexedIndexes) + elog(ERROR, "cannot reindex while reindexing"); + pendingReindexedIndexes = list_copy(indexes); +} + +/* + * RemoveReindexPending + * Remove the given index from the pending list. 
+ */ +static void +RemoveReindexPending(Oid indexOid) +{ + pendingReindexedIndexes = list_delete_oid(pendingReindexedIndexes, + indexOid); +} + +/* + * ResetReindexPending + * Unset reindex-pending status. + */ +static void +ResetReindexPending(void) +{ + pendingReindexedIndexes = NIL; +} diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 492d55fbcf5..e087b653b92 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.7 2010/01/02 16:57:36 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/storage.c,v 1.8 2010/02/07 20:48:09 tgl Exp $ * * NOTES * Some of this code used to be in storage/smgr/smgr.c, and the @@ -109,8 +109,7 @@ RelationCreateStorage(RelFileNode rnode, bool istemp) if (!istemp) { /* - * Make an XLOG entry showing the file creation. If we abort, the - * file will be dropped at abort time. + * Make an XLOG entry reporting the file creation. */ xlrec.rnode = rnode; @@ -166,6 +165,52 @@ RelationDropStorage(Relation rel) } /* + * RelationPreserveStorage + * Mark a relation as not to be deleted after all. + * + * We need this function because relation mapping changes are committed + * separately from commit of the whole transaction, so it's still possible + * for the transaction to abort after the mapping update is done. + * When a new physical relation is installed in the map, it would be + * scheduled for delete-on-abort, so we'd delete it, and be in trouble. + * The relation mapper fixes this by telling us to not delete such relations + * after all as part of its commit. + * + * No-op if the relation is not among those scheduled for deletion. 
+ */ +void +RelationPreserveStorage(RelFileNode rnode) +{ + PendingRelDelete *pending; + PendingRelDelete *prev; + PendingRelDelete *next; + + prev = NULL; + for (pending = pendingDeletes; pending != NULL; pending = next) + { + next = pending->next; + if (RelFileNodeEquals(rnode, pending->relnode)) + { + /* we should only find delete-on-abort entries, else trouble */ + if (pending->atCommit) + elog(ERROR, "cannot preserve a delete-on-commit relation"); + /* unlink and delete list entry */ + if (prev) + prev->next = next; + else + pendingDeletes = next; + pfree(pending); + /* prev does not change */ + } + else + { + /* unrelated entry, don't touch it */ + prev = pending; + } + } +} + +/* * RelationTruncate * Physically truncate a relation to the specified number of blocks. * @@ -200,13 +245,13 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * likely isn't going to succeed in the truncation either, and cause a * PANIC. It's tempting to put a critical section here, but that cure * would be worse than the disease. It would turn a usually harmless - * failure to truncate, that could spell trouble at WAL replay, into a + * failure to truncate, that might spell trouble at WAL replay, into a * certain PANIC. */ if (!rel->rd_istemp) { /* - * Make an XLOG entry showing the file truncation. + * Make an XLOG entry reporting the file truncation. 
*/ XLogRecPtr lsn; XLogRecData rdata; @@ -270,10 +315,8 @@ smgrDoPendingDeletes(bool isCommit) /* do deletion if called for */ if (pending->atCommit == isCommit) { - int i; - - /* schedule unlinking old files */ SMgrRelation srel; + int i; srel = smgropen(pending->relnode); for (i = 0; i <= MAX_FORKNUM; i++) @@ -440,7 +483,6 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) FreeSpaceMapTruncateRel(rel, xlrec->blkno); FreeFakeRelcacheEntry(rel); } - } else elog(PANIC, "smgr_redo: unknown op code %u", info); diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index ca70f19bf32..35bade50ea8 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.29 2010/02/03 01:14:16 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.30 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -114,6 +114,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio HeapTuple reltup; TupleDesc tupdesc; bool shared_relation; + bool mapped_relation; Relation class_rel; Oid toast_relid; Oid toast_idxid; @@ -139,6 +140,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("shared tables cannot be toasted after initdb"))); + /* It's mapped if and only if its parent is, too */ + mapped_relation = RelationIsMapped(rel); + /* * Is it already toasted? */ @@ -148,7 +152,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio /* * Check to see whether the table actually needs a TOAST table. 
* - * If an update-in-place relfilenode is specified, force toast file + * If an update-in-place toast relfilenode is specified, force toast file * creation even if it seems not to need one. */ if (!needs_toast_table(rel) && @@ -213,6 +217,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptio NIL, RELKIND_TOASTVALUE, shared_relation, + mapped_relation, true, 0, ONCOMMIT_NOOP, diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index cf2ac19d533..da605bffacf 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * cluster.c - * CLUSTER a table on an index. + * CLUSTER a table on an index. This is now also used for VACUUM FULL. * * There is hardly anything left of Paul Brown's original implementation... * @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.197 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.198 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -44,6 +44,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/relcache.h" +#include "utils/relmapper.h" #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -223,7 +224,8 @@ cluster(ClusterStmt *stmt, bool isTopLevel) StartTransactionCommand(); /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); - cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose, -1, -1); + cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose, + -1, -1); PopActiveSnapshot(); CommitTransactionCommand(); } @@ -245,13 +247,13 @@ cluster(ClusterStmt *stmt, bool isTopLevel) * GRANT, inheritance nor references to this table (this was a bug * in releases thru 7.3). 
* - * Also create new indexes and swap the filenodes with the old indexes the - * same way we do for the relation. Since we are effectively bulk-loading + * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading * the new table, it's better to create the indexes afterwards than to fill * them incrementally while we load the table. * * If indexOid is InvalidOid, the table will be rewritten in physical order - * instead of index order. + * instead of index order. This is the new implementation of VACUUM FULL, + * and error messages should refer to the operation as VACUUM not CLUSTER. */ void cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose, @@ -300,8 +302,7 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose, * somebody is executing a database-wide CLUSTER), because there is * another check in cluster() which will stop any attempt to cluster * remote temp tables by name. There is another check in - * check_index_is_clusterable which is redundant, but we leave it for - * extra safety. + * cluster_rel which is redundant, but we leave it for extra safety. */ if (RELATION_IS_OTHER_TEMP(OldHeap)) { @@ -344,10 +345,44 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose, } } + /* + * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER + * would work in most respects, but the index would only get marked as + * indisclustered in the current database, leading to unexpected behavior + * if CLUSTER were later invoked in another database. + */ + if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster a shared catalog"))); + + /* + * Don't process temp tables of other backends ... their local + * buffer manager is not going to cope. 
+ */ + if (RELATION_IS_OTHER_TEMP(OldHeap)) + { + if (OidIsValid(indexOid)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster temporary tables of other sessions"))); + else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot vacuum temporary tables of other sessions"))); + } + + /* + * Also check for active uses of the relation in the current transaction, + * including open scans and pending AFTER trigger events. + */ + CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM"); + /* Check heap and index are valid to cluster on */ - check_index_is_clusterable(OldHeap, indexOid, recheck); + if (OidIsValid(indexOid)) + check_index_is_clusterable(OldHeap, indexOid, recheck); - /* rebuild_relation does all the dirty work */ + /* Log what we're doing (this could use more effort) */ if (OidIsValid(indexOid)) ereport(verbose ? INFO : DEBUG2, (errmsg("clustering \"%s.%s\"", @@ -358,6 +393,8 @@ cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(OldHeap)), RelationGetRelationName(OldHeap)))); + + /* rebuild_relation does all the dirty work */ rebuild_relation(OldHeap, indexOid, freeze_min_age, freeze_table_age); /* NB: rebuild_relation does heap_close() on OldHeap */ @@ -376,38 +413,6 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck) { Relation OldIndex; - /* - * Disallow clustering system relations. This will definitely NOT work - * for shared relations (we have no way to update pg_class rows in other - * databases), nor for nailed-in-cache relations (the relfilenode values - * for those are hardwired, see relcache.c). It might work for other - * system relations, but I ain't gonna risk it. 
- */ - if (IsSystemRelation(OldHeap)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"%s\" is a system catalog", - RelationGetRelationName(OldHeap)))); - - /* - * Don't allow cluster on temp tables of other backends ... their local - * buffer manager is not going to cope. - */ - if (RELATION_IS_OTHER_TEMP(OldHeap)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster temporary tables of other sessions"))); - - /* - * Also check for active uses of the relation in the current transaction, - * including open scans and pending AFTER trigger events. - */ - CheckTableNotInUse(OldHeap, "CLUSTER"); - - /* Skip checks for index if not specified. */ - if (!OidIsValid(indexOid)) - return; - OldIndex = index_open(indexOid, AccessExclusiveLock); /* @@ -421,6 +426,13 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck) RelationGetRelationName(OldIndex), RelationGetRelationName(OldHeap)))); + /* Index AM must allow clustering */ + if (!OldIndex->rd_am->amclusterable) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster on index \"%s\" because access method does not support clustering", + RelationGetRelationName(OldIndex)))); + /* * Disallow clustering on incomplete indexes (those that might not index * every row of the relation). 
We could relax this by making a separate @@ -433,12 +445,6 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck) errmsg("cannot cluster on partial index \"%s\"", RelationGetRelationName(OldIndex)))); - if (!OldIndex->rd_am->amclusterable) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot cluster on index \"%s\" because access method does not support clustering", - RelationGetRelationName(OldIndex)))); - if (!OldIndex->rd_am->amindexnulls) { AttrNumber colno; @@ -585,6 +591,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, Oid tableOid = RelationGetRelid(OldHeap); Oid tableSpace = OldHeap->rd_rel->reltablespace; Oid OIDNewHeap; + bool is_system_catalog; bool swap_toast_by_content; TransactionId frozenXid; @@ -592,6 +599,9 @@ rebuild_relation(Relation OldHeap, Oid indexOid, if (OidIsValid(indexOid)) mark_index_clustered(OldHeap, indexOid); + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(OldHeap); + /* Close relcache entry, but keep lock until transaction commit */ heap_close(OldHeap, NoLock); @@ -603,12 +613,12 @@ rebuild_relation(Relation OldHeap, Oid indexOid, freeze_min_age, freeze_table_age, &swap_toast_by_content, &frozenXid); - /* Swap the physical files of the old and new heaps */ - swap_relation_files(tableOid, OIDNewHeap, - swap_toast_by_content, frozenXid); - - /* Destroy the new heap, removing the old data along with it */ - cleanup_heap_swap(tableOid, OIDNewHeap, swap_toast_by_content); + /* + * Swap the physical files of the target and transient tables, then + * rebuild the target's indexes and throw away the transient table. + */ + finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, + swap_toast_by_content, frozenXid); } @@ -619,8 +629,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, * NewTableSpace which might be different from OldHeap's. 
* * After this, the caller should load the new heap with transferred/modified - * data, then call swap_relation_files, and finally call cleanup_heap_swap to - * remove the debris. + * data, then call finish_heap_swap to complete the operation. */ Oid make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) @@ -666,6 +675,11 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) * relnames. Working around this seems more trouble than it's worth; in * particular, we can't create the new heap in a different namespace from * the old, or we will have problems with the TEMP status of temp tables. + * + * Note: the new heap is not a shared relation, even if we are rebuilding + * a shared rel. However, we do make the new heap mapped if the source + * is mapped. This simplifies swap_relation_files, and is absolutely + * necessary for rebuilding pg_class, for reasons explained there. */ snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap); @@ -679,13 +693,14 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) tupdesc, NIL, OldHeap->rd_rel->relkind, - OldHeap->rd_rel->relisshared, + false, + RelationIsMapped(OldHeap), true, 0, ONCOMMIT_NOOP, reloptions, false, - allowSystemTableMods); + true); ReleaseSysCache(tuple); @@ -696,14 +711,20 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) CommandCounterIncrement(); /* - * If necessary, create a TOAST table for the new relation. Note that - * AlterTableCreateToastTable ends with CommandCounterIncrement(), so that - * the TOAST table will be visible for insertion. + * If necessary, create a TOAST table for the new relation. + * + * If the relation doesn't have a TOAST table already, we can't need one + * for the new relation. The other way around is possible though: if + * some wide columns have been dropped, AlterTableCreateToastTable + * can decide that no TOAST table is needed for the new table. 
+ * + * Note that AlterTableCreateToastTable ends with CommandCounterIncrement, + * so that the TOAST table will be visible for insertion. */ toastid = OldHeap->rd_rel->reltoastrelid; - reloptions = (Datum) 0; if (OidIsValid(toastid)) { + /* keep the existing toast table's reloptions, if any */ tuple = SearchSysCache(RELOID, ObjectIdGetDatum(toastid), 0, 0, 0); @@ -713,11 +734,11 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace) &isNull); if (isNull) reloptions = (Datum) 0; - } - AlterTableCreateToastTable(OIDNewHeap, reloptions); - if (OidIsValid(toastid)) + AlterTableCreateToastTable(OIDNewHeap, reloptions); + ReleaseSysCache(tuple); + } heap_close(OldHeap, NoLock); @@ -747,6 +768,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, IndexScanDesc indexScan; HeapScanDesc heapScan; bool use_wal; + bool is_system_catalog; TransactionId OldestXmin; TransactionId FreezeXid; RewriteState rwstate; @@ -786,9 +808,14 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, */ if (!use_wal && !NewHeap->rd_istemp) { - char reason[NAMEDATALEN + 20]; - snprintf(reason, sizeof(reason), "CLUSTER on \"%s\"", - RelationGetRelationName(NewHeap)); + char reason[NAMEDATALEN + 32]; + + if (OldIndex != NULL) + snprintf(reason, sizeof(reason), "CLUSTER on \"%s\"", + RelationGetRelationName(NewHeap)); + else + snprintf(reason, sizeof(reason), "VACUUM FULL on \"%s\"", + RelationGetRelationName(NewHeap)); XLogReportUnloggedStatement(reason); } @@ -841,6 +868,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, /* return selected value to caller */ *pFreezeXid = FreezeXid; + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(OldHeap); + /* Initialize the rewrite operation */ rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal); @@ -909,25 +939,31 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, case HEAPTUPLE_INSERT_IN_PROGRESS: /* - * We should not see this unless it's been inserted 
earlier in - * our own transaction. + * Since we hold exclusive lock on the relation, normally + * the only way to see this is if it was inserted earlier + * in our own transaction. However, it can happen in system + * catalogs, since we tend to release write lock before commit + * there. Give a warning if neither case applies; but in + * any case we had better copy it. */ - if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmin(tuple->t_data))) - elog(ERROR, "concurrent insert in progress"); + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); /* treat as live */ isdead = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* - * We should not see this unless it's been deleted earlier in - * our own transaction. + * Similar situation to INSERT_IN_PROGRESS case. */ Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); - if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmax(tuple->t_data))) - elog(ERROR, "concurrent delete in progress"); + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple->t_data))) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); /* treat as recently dead */ isdead = false; break; @@ -1016,21 +1052,29 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, * table is added or removed altogether. * * Additionally, the first relation is marked with relfrozenxid set to - * frozenXid. It seems a bit ugly to have this here, but all callers would + * frozenXid. It seems a bit ugly to have this here, but the caller would * have to do it anyway, so having it here saves a heap_update. 
Note: in * the swap-toast-links case, we assume we don't need to change the toast * table's relfrozenxid: the new version of the toast table should already * have relfrozenxid set to RecentXmin, which is good enough. + * + * Lastly, if r2 and its toast table and toast index (if any) are mapped, + * their OIDs are emitted into mapped_tables[]. This is hacky but beats + * having to look the information up again later in finish_heap_swap. */ -void -swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, - TransactionId frozenXid) +static void +swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, + bool swap_toast_by_content, + TransactionId frozenXid, + Oid *mapped_tables) { Relation relRelation; HeapTuple reltup1, reltup2; Form_pg_class relform1, relform2; + Oid relfilenode1, + relfilenode2; Oid swaptemp; CatalogIndexState indstate; @@ -1051,29 +1095,86 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, elog(ERROR, "cache lookup failed for relation %u", r2); relform2 = (Form_pg_class) GETSTRUCT(reltup2); - /* - * Actually swap the fields in the two tuples - */ - swaptemp = relform1->relfilenode; - relform1->relfilenode = relform2->relfilenode; - relform2->relfilenode = swaptemp; + relfilenode1 = relform1->relfilenode; + relfilenode2 = relform2->relfilenode; - swaptemp = relform1->reltablespace; - relform1->reltablespace = relform2->reltablespace; - relform2->reltablespace = swaptemp; + if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2)) + { + /* Normal non-mapped relations: swap relfilenodes and reltablespaces */ + Assert(!target_is_pg_class); - if (!swap_toast_by_content) + swaptemp = relform1->relfilenode; + relform1->relfilenode = relform2->relfilenode; + relform2->relfilenode = swaptemp; + + swaptemp = relform1->reltablespace; + relform1->reltablespace = relform2->reltablespace; + relform2->reltablespace = swaptemp; + + /* Also swap toast links, if we're swapping by links */ + if (!swap_toast_by_content) + { + swaptemp = 
relform1->reltoastrelid; + relform1->reltoastrelid = relform2->reltoastrelid; + relform2->reltoastrelid = swaptemp; + + /* we should NOT swap reltoastidxid */ + } + } + else { - swaptemp = relform1->reltoastrelid; - relform1->reltoastrelid = relform2->reltoastrelid; - relform2->reltoastrelid = swaptemp; + /* + * Mapped-relation case. Here we have to swap the relation mappings + * instead of modifying the pg_class columns. Both must be mapped. + */ + if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2)) + elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation", + NameStr(relform1->relname)); + + /* + * We can't change the tablespace of a mapped rel, and we can't handle + * toast link swapping for one either, because we must not apply any + * critical changes to its pg_class row. These cases should be + * prevented by upstream permissions tests, so this check is a + * non-user-facing emergency backstop. + */ + if (relform1->reltablespace != relform2->reltablespace) + elog(ERROR, "cannot change tablespace of mapped relation \"%s\"", + NameStr(relform1->relname)); + if (!swap_toast_by_content && + (relform1->reltoastrelid || relform2->reltoastrelid)) + elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"", + NameStr(relform1->relname)); - /* we should not swap reltoastidxid */ + /* + * Fetch the mappings --- shouldn't fail, but be paranoid + */ + relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared); + if (!OidIsValid(relfilenode1)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + NameStr(relform1->relname), r1); + relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared); + if (!OidIsValid(relfilenode2)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + NameStr(relform2->relname), r2); + + /* + * Send replacement mappings to relmapper. Note these won't actually + * take effect until CommandCounterIncrement. 
+ */ + RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false); + RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false); + + /* Pass OIDs of mapped r2 tables back to caller */ + *mapped_tables++ = r2; } /* - * In the case of a shared catalog, these next few steps only affect our - * own database's pg_class row; but that's okay. + * In the case of a shared catalog, these next few steps will only affect + * our own database's pg_class row; but that's okay, because they are + * all noncritical updates. That's also an important fact for the case + * of a mapped catalog, because it's possible that we'll commit the map + * change and then fail to commit the pg_class update. */ /* set rel1's frozen Xid */ @@ -1097,15 +1198,31 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, relform2->reltuples = swap_tuples; } - /* Update the tuples in pg_class */ - simple_heap_update(relRelation, &reltup1->t_self, reltup1); - simple_heap_update(relRelation, &reltup2->t_self, reltup2); - - /* Keep system catalogs current */ - indstate = CatalogOpenIndexes(relRelation); - CatalogIndexInsert(indstate, reltup1); - CatalogIndexInsert(indstate, reltup2); - CatalogCloseIndexes(indstate); + /* + * Update the tuples in pg_class --- unless the target relation of the + * swap is pg_class itself. In that case, there is zero point in making + * changes because we'd be updating the old data that we're about to + * throw away. Because the real work being done here for a mapped relation + * is just to change the relation map settings, it's all right to not + * update the pg_class rows in this case. 
+ */ + if (!target_is_pg_class) + { + simple_heap_update(relRelation, &reltup1->t_self, reltup1); + simple_heap_update(relRelation, &reltup2->t_self, reltup2); + + /* Keep system catalogs current */ + indstate = CatalogOpenIndexes(relRelation); + CatalogIndexInsert(indstate, reltup1); + CatalogIndexInsert(indstate, reltup2); + CatalogCloseIndexes(indstate); + } + else + { + /* no update ... but we do still need relcache inval */ + CacheInvalidateRelcacheByTuple(reltup1); + CacheInvalidateRelcacheByTuple(reltup2); + } /* * If we have toast tables associated with the relations being swapped, @@ -1120,8 +1237,10 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, /* Recursively swap the contents of the toast tables */ swap_relation_files(relform1->reltoastrelid, relform2->reltoastrelid, - true, - frozenXid); + target_is_pg_class, + swap_toast_by_content, + frozenXid, + mapped_tables); } else { @@ -1146,6 +1265,15 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, toastobject; long count; + /* + * We disallow this case for system catalogs, to avoid the + * possibility that the catalog we're rebuilding is one of the + * ones the dependency changes would change. It's too late + * to be making any data changes to the target catalog. + */ + if (IsSystemClass(relform1)) + elog(ERROR, "cannot swap toast files by links for system catalogs"); + /* Delete old dependencies */ if (relform1->reltoastrelid) { @@ -1196,30 +1324,35 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, relform1->reltoastidxid && relform2->reltoastidxid) swap_relation_files(relform1->reltoastidxid, relform2->reltoastidxid, - true, - InvalidTransactionId); - - /* - * Blow away the old relcache entries now. We need this kluge because - * relcache.c keeps a link to the smgr relation for the physical file, and - * that will be out of date as soon as we do CommandCounterIncrement. 
- * Whichever of the rels is the second to be cleared during cache - * invalidation will have a dangling reference to an already-deleted smgr - * relation. Rather than trying to avoid this by ordering operations just - * so, it's easiest to not have the relcache entries there at all. - * (Fortunately, since one of the entries is local in our transaction, - * it's sufficient to clear out our own relcache this way; the problem - * cannot arise for other backends when they see our update on the - * non-local relation.) - */ - RelationForgetRelation(r1); - RelationForgetRelation(r2); + target_is_pg_class, + swap_toast_by_content, + InvalidTransactionId, + mapped_tables); /* Clean up. */ heap_freetuple(reltup1); heap_freetuple(reltup2); heap_close(relRelation, RowExclusiveLock); + + /* + * Close both relcache entries' smgr links. We need this kluge because + * both links will be invalidated during upcoming CommandCounterIncrement. + * Whichever of the rels is the second to be cleared will have a dangling + * reference to the other's smgr entry. Rather than trying to avoid this + * by ordering operations just so, it's easiest to close the links first. + * (Fortunately, since one of the entries is local in our transaction, + * it's sufficient to clear out our own relcache this way; the problem + * cannot arise for other backends when they see our update on the + * non-transient relation.) + * + * Caution: the placement of this step interacts with the decision to + * handle toast rels by recursion. When we are trying to rebuild pg_class + * itself, the smgr close on pg_class must happen after all accesses in + * this function. + */ + RelationCloseSmgrByOid(r1); + RelationCloseSmgrByOid(r2); } /* @@ -1227,12 +1360,43 @@ swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, * cleaning up (including rebuilding all indexes on the old heap). 
*/ void -cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content) +finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, + bool is_system_catalog, + bool swap_toast_by_content, + TransactionId frozenXid) { ObjectAddress object; + Oid mapped_tables[4]; + int i; - /* Make swap_relation_files' changes visible in the catalogs. */ - CommandCounterIncrement(); + /* Zero out possible results from swap_relation_files */ + memset(mapped_tables, 0, sizeof(mapped_tables)); + + /* + * Swap the contents of the heap relations (including any toast tables). + * Also set old heap's relfrozenxid to frozenXid. + */ + swap_relation_files(OIDOldHeap, OIDNewHeap, + (OIDOldHeap == RelationRelationId), + swap_toast_by_content, frozenXid, mapped_tables); + + /* + * If it's a system catalog, queue an sinval message to flush all + * catcaches on the catalog when we reach CommandCounterIncrement. + */ + if (is_system_catalog) + CacheInvalidateCatalog(OIDOldHeap); + + /* + * Rebuild each index on the relation (but not the toast table, which is + * all-new at this point). It is important to do this before the DROP + * step because if we are processing a system catalog that will be used + * during DROP, we want to have its indexes available. There is no + * advantage to the other order anyway because this is all transactional, + * so no chance to reclaim disk space before commit. We do not need + * a final CommandCounterIncrement() because reindex_relation does it. + */ + reindex_relation(OIDOldHeap, false, true); /* Destroy new heap with old filenode */ object.classId = RelationRelationId; @@ -1248,11 +1412,13 @@ cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool swap_toast_by_content) /* performDeletion does CommandCounterIncrement at end */ /* - * Rebuild each index on the relation (but not the toast table, which is - * all-new at this point). We do not need CommandCounterIncrement() - * because reindex_relation does it.
+ * Now we must remove any relation mapping entries that we set up for the + * transient table, as well as its toast table and toast index if any. + * If we fail to do this before commit, the relmapper will complain about + * new permanent map entries being added post-bootstrap. */ - reindex_relation(OIDOldHeap, false); + for (i = 0; OidIsValid(mapped_tables[i]); i++) + RelationMapRemoveMapping(mapped_tables[i]); /* * At this point, everything is kosher except that, if we did toast swap diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index bbec82f2076..7e6be57ee82 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.190 2010/01/02 16:57:37 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.191 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -244,10 +244,15 @@ DefineIndex(RangeVar *heapRelation, /* * Force shared indexes into the pg_global tablespace. This is a bit of a - * hack but seems simpler than marking them in the BKI commands. + * hack but seems simpler than marking them in the BKI commands. On the + * other hand, if it's not shared, don't allow it to be placed there. */ if (rel->rd_rel->relisshared) tablespaceId = GLOBALTABLESPACE_OID; + else if (tablespaceId == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); /* * Choose the index column names. 
@@ -1615,16 +1620,9 @@ ReindexTable(RangeVar *relation) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, relation->relname); - /* Can't reindex shared tables except in standalone mode */ - if (((Form_pg_class) GETSTRUCT(tuple))->relisshared && IsUnderPostmaster) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("shared table \"%s\" can only be reindexed in stand-alone mode", - relation->relname))); - ReleaseSysCache(tuple); - if (!reindex_relation(heapOid, true)) + if (!reindex_relation(heapOid, true, false)) ereport(NOTICE, (errmsg("table \"%s\" has no indexes", relation->relname))); @@ -1717,12 +1715,6 @@ ReindexDatabase(const char *databaseName, bool do_system, bool do_user) continue; } - if (IsUnderPostmaster) /* silently ignore shared tables */ - { - if (classtuple->relisshared) - continue; - } - if (HeapTupleGetOid(tuple) == RelationRelationId) continue; /* got it already */ @@ -1743,7 +1735,7 @@ ReindexDatabase(const char *databaseName, bool do_system, bool do_user) StartTransactionCommand(); /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); - if (reindex_relation(relid, true)) + if (reindex_relation(relid, true, false)) ereport(NOTICE, (errmsg("table \"%s\" was reindexed", get_rel_name(relid)))); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index dba5f29d661..683c7f58d81 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.324 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.325 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -436,6 +436,12 @@ DefineRelation(CreateStmt *stmt, char relkind) get_tablespace_name(tablespaceId)); } + /* In all cases disallow placing user relations in pg_global */ + if (tablespaceId == 
GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); + /* * Parse and validate reloptions, if any. */ @@ -534,6 +540,7 @@ DefineRelation(CreateStmt *stmt, char relkind) old_constraints), relkind, false, + false, localHasOids, parentOidCount, stmt->oncommit, @@ -1014,7 +1021,7 @@ ExecuteTruncate(TruncateStmt *stmt) /* * Reconstruct the indexes to match, and we're done. */ - reindex_relation(heap_relid, true); + reindex_relation(heap_relid, true, false); } } @@ -1092,16 +1099,6 @@ truncate_check_rel(Relation rel) RelationGetRelationName(rel)))); /* - * We can never allow truncation of shared or nailed-in-cache relations, - * because we can't support changing their relfilenode values. - */ - if (rel->rd_rel->relisshared || rel->rd_isnailed) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot truncate system relation \"%s\"", - RelationGetRelationName(rel)))); - - /* * Don't allow truncate on temp tables of other backends ... their local * buffer manager is not going to cope. */ @@ -2873,11 +2870,11 @@ ATRewriteTables(List **wqueue) OldHeap = heap_open(tab->relid, NoLock); /* - * We can never allow rewriting of shared or nailed-in-cache - * relations, because we can't support changing their relfilenode - * values. + * We don't support rewriting of system catalogs; there are + * too many corner cases and too little benefit. In particular + * this is certainly not going to work for mapped catalogs. */ - if (OldHeap->rd_rel->relisshared || OldHeap->rd_isnailed) + if (IsSystemRelation(OldHeap)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot rewrite system relation \"%s\"", @@ -2914,17 +2911,14 @@ ATRewriteTables(List **wqueue) ATRewriteTable(tab, OIDNewHeap); /* - * Swap the physical files of the old and new heaps. 
Since we are - * generating a new heap, we can use RecentXmin for the table's - * new relfrozenxid because we rewrote all the tuples on - * ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have - * no interest in letting this code work on system catalogs. + * Swap the physical files of the old and new heaps, then rebuild + * indexes and discard the new heap. We can use RecentXmin for + * the table's new relfrozenxid because we rewrote all the tuples + * in ATRewriteTable, so no older Xid remains in the table. Also, + * we never try to swap toast tables by content, since we have no + * interest in letting this code work on system catalogs. */ - swap_relation_files(tab->relid, OIDNewHeap, false, RecentXmin); - - /* Destroy the new heap, removing the old data along with it. */ - cleanup_heap_swap(tab->relid, OIDNewHeap, false); + finish_heap_swap(tab->relid, OIDNewHeap, false, false, RecentXmin); } else { @@ -3715,7 +3709,7 @@ ATExecAddColumn(AlteredTableInfo *tab, Relation rel, typeOid = HeapTupleGetOid(typeTuple); /* make sure datatype is legal for a column */ - CheckAttributeType(colDef->colname, typeOid); + CheckAttributeType(colDef->colname, typeOid, false); /* construct new attribute's pg_attribute entry */ attribute.attrelid = myrelid; @@ -5825,7 +5819,7 @@ ATPrepAlterColumnType(List **wqueue, targettype = typenameTypeId(NULL, typeName, &targettypmod); /* make sure datatype is legal for a column */ - CheckAttributeType(colName, targettype); + CheckAttributeType(colName, targettype, false); /* * Set up an expression to transform the old data value to the new type. @@ -6925,10 +6919,21 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) rel = relation_open(tableOid, AccessExclusiveLock); /* - * We can never allow moving of shared or nailed-in-cache relations, - * because we can't support changing their reltablespace values. + * No work if no change in tablespace. 
+ */ + oldTableSpace = rel->rd_rel->reltablespace; + if (newTableSpace == oldTableSpace || + (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) + { + relation_close(rel, NoLock); + return; + } + + /* + * We cannot support moving mapped relations into different tablespaces. + * (In particular this eliminates all shared catalogs.) */ - if (rel->rd_rel->relisshared || rel->rd_isnailed) + if (RelationIsMapped(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot move system relation \"%s\"", @@ -6949,17 +6954,6 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot move temporary tables of other sessions"))); - /* - * No work if no change in tablespace. - */ - oldTableSpace = rel->rd_rel->reltablespace; - if (newTableSpace == oldTableSpace || - (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) - { - relation_close(rel, NoLock); - return; - } - reltoastrelid = rel->rd_rel->reltoastrelid; reltoastidxid = rel->rd_rel->reltoastidxid; @@ -6985,9 +6979,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace) * Relfilenodes are not unique across tablespaces, so we need to allocate * a new one in the new tablespace. 
*/ - newrelfilenode = GetNewRelFileNode(newTableSpace, - rel->rd_rel->relisshared, - NULL); + newrelfilenode = GetNewRelFileNode(newTableSpace, NULL); /* Open old and new relation */ newrnode = rel->rd_node; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 850680950e5..e18ed084b41 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.403 2010/01/06 05:31:13 itagaki Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.404 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1183,11 +1183,10 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, /* * Do the actual work --- either FULL, FULL INPLACE, or "lazy" vacuum. - * We can use only FULL INPLACE vacuum for system relations. */ if (!(vacstmt->options & VACOPT_FULL)) heldoff = lazy_vacuum_rel(onerel, vacstmt, vac_strategy, scanned_all); - else if ((vacstmt->options & VACOPT_INPLACE) || IsSystemRelation(onerel)) + else if (vacstmt->options & VACOPT_INPLACE) heldoff = full_vacuum_rel(onerel, vacstmt); else { @@ -1196,8 +1195,8 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound, onerel = NULL; cluster_rel(relid, InvalidOid, false, - (vacstmt->options & VACOPT_VERBOSE) != 0, - vacstmt->freeze_min_age, vacstmt->freeze_table_age); + (vacstmt->options & VACOPT_VERBOSE) != 0, + vacstmt->freeze_min_age, vacstmt->freeze_table_age); heldoff = false; } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 5a8af0b2f75..85566b77dcd 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.344 2010/02/03 10:01:30 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.345 2010/02/07 
20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2168,6 +2168,7 @@ OpenIntoRel(QueryDesc *queryDesc) NIL, RELKIND_RELATION, false, + false, true, 0, into->onCommit, diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 0f615e674c0..54c5cb39e8e 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.195 2010/01/02 16:57:49 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.196 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -609,7 +609,7 @@ transformRangeFunction(ParseState *pstate, RangeFunction *r) tupdesc = BuildDescFromLists(rte->eref->colnames, rte->funccoltypes, rte->funccoltypmods); - CheckAttributeNamesTypes(tupdesc, RELKIND_COMPOSITE_TYPE); + CheckAttributeNamesTypes(tupdesc, RELKIND_COMPOSITE_TYPE, false); } return rte; diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index d2d91975872..d894e8906c8 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -1,11 +1,11 @@ /* * dbsize.c - * object size functions + * Database object size functions, and related inquiries * * Copyright (c) 2002-2010, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/dbsize.c,v 1.28 2010/01/23 21:29:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/dbsize.c,v 1.29 2010/02/07 20:48:10 tgl Exp $ * */ @@ -25,6 +25,7 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/rel.h" +#include "utils/relmapper.h" #include "utils/syscache.h" @@ -507,3 +508,121 @@ pg_size_pretty(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(cstring_to_text(buf)); } + +/* + * Get the filenode of a relation + * + * This is expected to be used in queries like + * SELECT pg_relation_filenode(oid) 
FROM pg_class; + * That leads to a couple of choices. We work from the pg_class row alone + * rather than actually opening each relation, for efficiency. We don't + * fail if we can't find the relation --- some rows might be visible in + * the query's MVCC snapshot but already dead according to SnapshotNow. + * (Note: we could avoid using the catcache, but there's little point + * because the relation mapper also works "in the now".) We also don't + * fail if the relation doesn't have storage. In all these cases it + * seems better to quietly return NULL. + */ +Datum +pg_relation_filenode(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + Oid result; + HeapTuple tuple; + Form_pg_class relform; + + tuple = SearchSysCache(RELOID, + ObjectIdGetDatum(relid), + 0, 0, 0); + if (!HeapTupleIsValid(tuple)) + PG_RETURN_NULL(); + relform = (Form_pg_class) GETSTRUCT(tuple); + + switch (relform->relkind) + { + case RELKIND_RELATION: + case RELKIND_INDEX: + case RELKIND_SEQUENCE: + case RELKIND_TOASTVALUE: + /* okay, these have storage */ + if (relform->relfilenode) + result = relform->relfilenode; + else /* Consult the relation mapper */ + result = RelationMapOidToFilenode(relid, + relform->relisshared); + break; + + default: + /* no storage, return NULL */ + result = InvalidOid; + break; + } + + ReleaseSysCache(tuple); + + if (!OidIsValid(result)) + PG_RETURN_NULL(); + + PG_RETURN_OID(result); +} + +/* + * Get the pathname (relative to $PGDATA) of a relation + * + * See comments for pg_relation_filenode. 
+ */ +Datum +pg_relation_filepath(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + HeapTuple tuple; + Form_pg_class relform; + RelFileNode rnode; + char *path; + + tuple = SearchSysCache(RELOID, + ObjectIdGetDatum(relid), + 0, 0, 0); + if (!HeapTupleIsValid(tuple)) + PG_RETURN_NULL(); + relform = (Form_pg_class) GETSTRUCT(tuple); + + switch (relform->relkind) + { + case RELKIND_RELATION: + case RELKIND_INDEX: + case RELKIND_SEQUENCE: + case RELKIND_TOASTVALUE: + /* okay, these have storage */ + + /* This logic should match RelationInitPhysicalAddr */ + if (relform->reltablespace) + rnode.spcNode = relform->reltablespace; + else + rnode.spcNode = MyDatabaseTableSpace; + if (rnode.spcNode == GLOBALTABLESPACE_OID) + rnode.dbNode = InvalidOid; + else + rnode.dbNode = MyDatabaseId; + if (relform->relfilenode) + rnode.relNode = relform->relfilenode; + else /* Consult the relation mapper */ + rnode.relNode = RelationMapOidToFilenode(relid, + relform->relisshared); + break; + + default: + /* no storage, return NULL */ + rnode.relNode = InvalidOid; + break; + } + + ReleaseSysCache(tuple); + + if (!OidIsValid(rnode.relNode)) + PG_RETURN_NULL(); + + path = relpath(rnode, MAIN_FORKNUM); + + PG_RETURN_TEXT_P(cstring_to_text(path)); +} diff --git a/src/backend/utils/cache/Makefile b/src/backend/utils/cache/Makefile index 617cb677f7a..d1caf8e4aeb 100644 --- a/src/backend/utils/cache/Makefile +++ b/src/backend/utils/cache/Makefile @@ -4,7 +4,7 @@ # Makefile for utils/cache # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.25 2010/01/22 16:40:19 rhaas Exp $ +# $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.26 2010/02/07 20:48:10 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/utils/cache top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o \ +OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o relmapper.o \ spccache.o syscache.o lsyscache.o typcache.o ts_cache.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 8b606a8da27..aac1e87d87e 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.148 2010/01/02 16:57:55 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.149 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,7 @@ #endif #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" @@ -679,17 +680,6 @@ ResetCatalogCaches(void) * or a temp table being dropped at end of transaction, or a table created * during the current transaction that is being dropped because of abort.) * Remove all cache entries relevant to the specified relation OID. - * - * A special case occurs when relId is itself one of the cacheable system - * tables --- although those'll never be dropped, they can get flushed from - * the relcache (VACUUM causes this, for example). In that case we need - * to flush all cache entries that came from that table. (At one point we - * also tried to force re-execution of CatalogCacheInitializeCache for - * the cache(s) on that table. This is a bad idea since it leads to all - * kinds of trouble if a cache flush occurs while loading cache entries. - * We now avoid the need to do it by copying cc_tupdesc out of the relcache, - * rather than relying on the relcache to keep a tupdesc for us. 
Of course - * this assumes the tupdesc of a cachable system table will not change...) */ void CatalogCacheFlushRelation(Oid relId) @@ -706,14 +696,6 @@ CatalogCacheFlushRelation(Oid relId) if (cache->cc_tupdesc == NULL) continue; - /* Does this cache store tuples of the target relation itself? */ - if (cache->cc_tupdesc->attrs[0]->attrelid == relId) - { - /* Yes, so flush all its contents */ - ResetCatalogCache(cache); - continue; - } - /* Does this cache store tuples associated with relations at all? */ if (cache->cc_reloidattr == 0) continue; /* nope, leave it alone */ @@ -776,6 +758,46 @@ CatalogCacheFlushRelation(Oid relId) } /* + * CatalogCacheFlushCatalog + * + * Flush all catcache entries that came from the specified system catalog. + * This is needed after VACUUM FULL/CLUSTER on the catalog, since the + * tuples very likely now have different TIDs than before. (At one point + * we also tried to force re-execution of CatalogCacheInitializeCache for + * the cache(s) on that catalog. This is a bad idea since it leads to all + * kinds of trouble if a cache flush occurs while loading cache entries. + * We now avoid the need to do it by copying cc_tupdesc out of the relcache, + * rather than relying on the relcache to keep a tupdesc for us. Of course + * this assumes the tupdesc of a cachable system table will not change...) + */ +void +CatalogCacheFlushCatalog(Oid catId) +{ + CatCache *cache; + + CACHE2_elog(DEBUG2, "CatalogCacheFlushCatalog called for %u", catId); + + for (cache = CacheHdr->ch_caches; cache; cache = cache->cc_next) + { + /* We can ignore uninitialized caches, since they must be empty */ + if (cache->cc_tupdesc == NULL) + continue; + + /* Does this cache store tuples of the target catalog? 
*/ + if (cache->cc_tupdesc->attrs[0]->attrelid == catId) + { + /* Yes, so flush all its contents */ + ResetCatalogCache(cache); + + /* Tell inval.c to call syscache callbacks for this cache */ + CallSyscacheCallbacks(cache->id, NULL); + } + } + + CACHE1_elog(DEBUG2, "end of CatalogCacheFlushCatalog call"); +} + +/* * InitCatCache * * This allocates and initializes a cache for a system catalog relation. diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 99aad752bb3..96439fda18a 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -80,7 +80,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.93 2010/02/03 01:14:17 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.94 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -96,6 +96,7 @@ #include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/relmapper.h" #include "utils/syscache.h" @@ -326,6 +327,21 @@ AddCatcacheInvalidationMessage(InvalidationListHeader *hdr, } /* + * Add a whole-catalog inval entry + */ +static void +AddCatalogInvalidationMessage(InvalidationListHeader *hdr, + Oid dbId, Oid catId) +{ + SharedInvalidationMessage msg; + + msg.cat.id = SHAREDINVALCATALOG_ID; + msg.cat.dbId = dbId; + msg.cat.catId = catId; + AddInvalidationMessage(&hdr->cclist, &msg); +} + +/* * Add a relcache inval entry */ static void @@ -407,6 +423,18 @@ RegisterCatcacheInvalidation(int cacheId, } /* + * RegisterCatalogInvalidation + * + * Register an invalidation event for all catcache entries from a catalog. 
+ */ +static void +RegisterCatalogInvalidation(Oid dbId, Oid catId) +{ + AddCatalogInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs, + dbId, catId); +} + +/* * RegisterRelcacheInvalidation * * As above, but register a relcache invalidation event. @@ -443,30 +471,32 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId) static void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) { - int i; - if (msg->id >= 0) { - if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == 0) + if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == InvalidOid) { CatalogCacheIdInvalidate(msg->cc.id, msg->cc.hashValue, &msg->cc.tuplePtr); - for (i = 0; i < syscache_callback_count; i++) - { - struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i; + CallSyscacheCallbacks(msg->cc.id, &msg->cc.tuplePtr); + } + } + else if (msg->id == SHAREDINVALCATALOG_ID) + { + if (msg->cat.dbId == MyDatabaseId || msg->cat.dbId == InvalidOid) + { + CatalogCacheFlushCatalog(msg->cat.catId); - if (ccitem->id == msg->cc.id) - (*ccitem->function) (ccitem->arg, - msg->cc.id, &msg->cc.tuplePtr); - } + /* CatalogCacheFlushCatalog calls CallSyscacheCallbacks as needed */ } } else if (msg->id == SHAREDINVALRELCACHE_ID) { if (msg->rc.dbId == MyDatabaseId || msg->rc.dbId == InvalidOid) { + int i; + RelationCacheInvalidateEntry(msg->rc.relId); for (i = 0; i < relcache_callback_count; i++) @@ -485,6 +515,14 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) */ smgrclosenode(msg->sm.rnode); } + else if (msg->id == SHAREDINVALRELMAP_ID) + { + /* We only care about our own database and shared catalogs */ + if (msg->rm.dbId == InvalidOid) + RelationMapInvalidate(true); + else if (msg->rm.dbId == MyDatabaseId) + RelationMapInvalidate(false); + } else elog(FATAL, "unrecognized SI message id: %d", msg->id); } @@ -506,7 +544,7 @@ InvalidateSystemCaches(void) int i; ResetCatalogCaches(); - RelationCacheInvalidate(); /* gets smgr cache too */ + RelationCacheInvalidate(); /* gets smgr and 
relmap too */ for (i = 0; i < syscache_callback_count; i++) { @@ -874,7 +912,7 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, else { /* - * Invalidation message is a SHAREDINVALSMGR_ID + * Invalidation message is a catalog or nontransactional inval, * which never cause relcache file invalidation, * so we ignore them, no matter which db they're for. */ @@ -1183,6 +1221,30 @@ CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple) } /* + * CacheInvalidateCatalog + * Register invalidation of the whole content of a system catalog. + * + * This is normally used in VACUUM FULL/CLUSTER, where we haven't so much + * changed any tuples as moved them around. Some uses of catcache entries + * expect their TIDs to be correct, so we have to blow away the entries. + * + * Note: we expect caller to verify that the rel actually is a system + * catalog. If it isn't, no great harm is done, just a wasted sinval message. + */ +void +CacheInvalidateCatalog(Oid catalogId) +{ + Oid databaseId; + + if (IsSharedRelation(catalogId)) + databaseId = InvalidOid; + else + databaseId = MyDatabaseId; + + RegisterCatalogInvalidation(databaseId, catalogId); +} + +/* * CacheInvalidateRelcache * Register invalidation of the specified relation's relcache entry * at end of command. @@ -1277,6 +1339,31 @@ CacheInvalidateSmgr(RelFileNode rnode) SendSharedInvalidMessages(&msg, 1); } +/* + * CacheInvalidateRelmap + * Register invalidation of the relation mapping for a database, + * or for the shared catalogs if databaseId is zero. + * + * Sending this type of invalidation msg forces other backends to re-read + * the indicated relation mapping file. It is also necessary to send a + * relcache inval for the specific relations whose mapping has been altered, + * else the relcache won't get updated with the new filenode data. + * + * Note: because these messages are nontransactional, they won't be captured + * in commit/abort WAL entries. 
Instead, calls to CacheInvalidateRelmap() + * should happen in low-level relmapper.c routines, which are executed while + * replaying WAL as well as when creating it. + */ +void +CacheInvalidateRelmap(Oid databaseId) +{ + SharedInvalidationMessage msg; + + msg.rm.id = SHAREDINVALRELMAP_ID; + msg.rm.dbId = databaseId; + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheRegisterSyscacheCallback @@ -1323,3 +1410,23 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } + +/* + * CallSyscacheCallbacks + * + * This is exported so that CatalogCacheFlushCatalog can call it, saving + * this module from knowing which catcache IDs correspond to which catalogs. + */ +void +CallSyscacheCallbacks(int cacheid, ItemPointer tuplePtr) +{ + int i; + + for (i = 0; i < syscache_callback_count; i++) + { + struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i; + + if (ccitem->id == cacheid) + (*ccitem->function) (ccitem->arg, cacheid, tuplePtr); + } +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index e71416c0f70..ff85195ed13 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.302 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.303 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -72,6 +72,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/relcache.h" +#include "utils/relmapper.h" #include "utils/resowner.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -838,6 +839,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) */ relid = HeapTupleGetOid(pg_class_tuple); relp = (Form_pg_class) GETSTRUCT(pg_class_tuple); + Assert(relid == targetRelId); /* * allocate storage for the relation descriptor, and copy pg_class_tuple @@ -927,6 
+929,10 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) /* * Initialize the physical addressing info (RelFileNode) for a relcache entry + * + * Note: at the physical level, relations in the pg_global tablespace must + * be treated as shared, even if relisshared isn't set. Hence we do not + * look at relisshared here. */ static void RelationInitPhysicalAddr(Relation relation) @@ -935,11 +941,22 @@ RelationInitPhysicalAddr(Relation relation) relation->rd_node.spcNode = relation->rd_rel->reltablespace; else relation->rd_node.spcNode = MyDatabaseTableSpace; - if (relation->rd_rel->relisshared) + if (relation->rd_node.spcNode == GLOBALTABLESPACE_OID) relation->rd_node.dbNode = InvalidOid; else relation->rd_node.dbNode = MyDatabaseId; - relation->rd_node.relNode = relation->rd_rel->relfilenode; + if (relation->rd_rel->relfilenode) + relation->rd_node.relNode = relation->rd_rel->relfilenode; + else + { + /* Consult the relation mapper */ + relation->rd_node.relNode = + RelationMapOidToFilenode(relation->rd_id, + relation->rd_rel->relisshared); + if (!OidIsValid(relation->rd_node.relNode)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + RelationGetRelationName(relation), relation->rd_id); + } } /* @@ -1496,7 +1513,18 @@ formrdesc(const char *relationName, Oid relationReltype, * initialize relation id from info in att array (my, this is ugly) */ RelationGetRelid(relation) = relation->rd_att->attrs[0]->attrelid; - relation->rd_rel->relfilenode = RelationGetRelid(relation); + + /* + * All relations made with formrdesc are mapped. This is necessarily so + * because there is no other way to know what filenode they currently + * have. In bootstrap mode, add them to the initial relation mapper data, + * specifying that the initial filenode is the same as the OID. 
+ */ + relation->rd_rel->relfilenode = InvalidOid; + if (IsBootstrapProcessingMode()) + RelationMapUpdateMap(RelationGetRelid(relation), + RelationGetRelid(relation), + isshared, true); /* * initialize the relation lock manager information @@ -1841,7 +1869,9 @@ RelationClearRelation(Relation relation, bool rebuild) * Never, never ever blow away a nailed-in system relation, because we'd * be unable to recover. However, we must reset rd_targblock, in case we * got called because of a relation cache flush that was triggered by - * VACUUM. Likewise reset the fsm and vm size info. + * VACUUM. Likewise reset the fsm and vm size info. Also, redo + * RelationInitPhysicalAddr in case it is a mapped relation whose mapping + * changed. * * If it's a nailed index, then we need to re-read the pg_class row to see * if its relfilenode changed. We can't necessarily do that here, because @@ -1855,6 +1885,9 @@ RelationClearRelation(Relation relation, bool rebuild) relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; relation->rd_vm_nblocks = InvalidBlockNumber; + /* We must recalculate physical address in case it changed */ + RelationInitPhysicalAddr(relation); + if (relation->rd_rel->relkind == RELKIND_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ @@ -1885,7 +1918,8 @@ RelationClearRelation(Relation relation, bool rebuild) /* * Clear out catcache's entries for this relation. This is a bit of - * a hack, but it's a convenient place to do it. + * a hack, but it's a convenient place to do it. (XXX do we really + * still need this?) */ CatalogCacheFlushRelation(RelationGetRelid(relation)); @@ -2104,7 +2138,7 @@ RelationCacheInvalidateEntry(Oid relationId) * RelationCacheInvalidate * Blow away cached relation descriptors that have zero reference counts, * and rebuild those with positive reference counts. Also reset the smgr - * relation cache. + * relation cache and re-read relation mapping data. 
* * This is currently used only to recover from SI message buffer overflow, * so we do not touch new-in-transaction relations; they cannot be targets @@ -2190,6 +2224,11 @@ RelationCacheInvalidate(void) */ smgrcloseall(); + /* + * Reload relation mapping data before starting to reconstruct cache. + */ + RelationMapInvalidateAll(); + /* Phase 2: rebuild the items found to need rebuild in phase 1 */ foreach(l, rebuildFirstList) { @@ -2206,6 +2245,25 @@ RelationCacheInvalidate(void) } /* + * RelationCloseSmgrByOid - close a relcache entry's smgr link + * + * Needed in some cases where we are changing a relation's physical mapping. + * The link will be automatically reopened on next use. + */ +void +RelationCloseSmgrByOid(Oid relationId) +{ + Relation relation; + + RelationIdCacheLookup(relationId, relation); + + if (!PointerIsValid(relation)) + return; /* not in cache, nothing to do */ + + RelationCloseSmgr(relation); +} + +/* * AtEOXact_RelationCache * * Clean up the relcache at main-transaction commit or abort. @@ -2393,7 +2451,8 @@ RelationBuildLocalRelation(const char *relname, TupleDesc tupDesc, Oid relid, Oid reltablespace, - bool shared_relation) + bool shared_relation, + bool mapped_relation) { Relation rel; MemoryContext oldcxt; @@ -2409,6 +2468,8 @@ RelationBuildLocalRelation(const char *relname, * * XXX this list had better match the relations specially handled in * RelationCacheInitializePhase2/3. + * + * XXX do we need this at all?? */ switch (relid) { @@ -2434,6 +2495,9 @@ RelationBuildLocalRelation(const char *relname, elog(ERROR, "shared_relation flag for \"%s\" does not match IsSharedRelation(%u)", relname, relid); + /* Shared relations had better be mapped, too */ + Assert(mapped_relation || !shared_relation); + /* * switch to the cache context to create the relcache entry. */ @@ -2512,7 +2576,9 @@ RelationBuildLocalRelation(const char *relname, /* * Insert relation physical and logical identifiers (OIDs) into the right * places. 
Note that the physical ID (relfilenode) is initially the same - * as the logical ID (OID). + * as the logical ID (OID); except that for a mapped relation, we set + * relfilenode to zero and rely on RelationInitPhysicalAddr to consult + * the map. */ rel->rd_rel->relisshared = shared_relation; rel->rd_rel->relistemp = rel->rd_istemp; @@ -2522,9 +2588,17 @@ RelationBuildLocalRelation(const char *relname, for (i = 0; i < natts; i++) rel->rd_att->attrs[i]->attrelid = relid; - rel->rd_rel->relfilenode = relid; rel->rd_rel->reltablespace = reltablespace; + if (mapped_relation) + { + rel->rd_rel->relfilenode = InvalidOid; + /* Add it to the active mapping information */ + RelationMapUpdateMap(relid, relid, shared_relation, true); + } + else + rel->rd_rel->relfilenode = relid; + RelationInitLockInfo(rel); /* see lmgr.c */ RelationInitPhysicalAddr(rel); @@ -2577,24 +2651,16 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) HeapTuple tuple; Form_pg_class classform; - /* Can't change relfilenode for nailed tables (indexes ok though) */ - Assert(!relation->rd_isnailed || - relation->rd_rel->relkind == RELKIND_INDEX); - /* Can't change for shared tables or indexes */ - Assert(!relation->rd_rel->relisshared); /* Indexes must have Invalid frozenxid; other relations must not */ Assert((relation->rd_rel->relkind == RELKIND_INDEX && freezeXid == InvalidTransactionId) || TransactionIdIsNormal(freezeXid)); /* Allocate a new relfilenode */ - newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, - relation->rd_rel->relisshared, - NULL); + newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL); /* - * Find the pg_class tuple for the given relation. This is not used - * during bootstrap, so okay to use heap_update always. + * Get a writable copy of the pg_class tuple for the given relation. 
*/ pg_class = heap_open(RelationRelationId, RowExclusiveLock); @@ -2623,12 +2689,23 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) RelationDropStorage(relation); /* - * Now update the pg_class row. + * Now update the pg_class row. However, if we're dealing with a mapped + * index, pg_class.relfilenode doesn't change; instead we have to send + * the update to the relation mapper. */ - classform->relfilenode = newrelfilenode; + if (RelationIsMapped(relation)) + RelationMapUpdateMap(RelationGetRelid(relation), + newrelfilenode, + relation->rd_rel->relisshared, + false); + else + classform->relfilenode = newrelfilenode; + + /* These changes are safe even for a mapped relation */ classform->relpages = 0; /* it's empty until further notice */ classform->reltuples = 0; classform->relfrozenxid = freezeXid; + simple_heap_update(pg_class, &tuple->t_self, tuple); CatalogUpdateIndexes(pg_class, tuple); @@ -2637,8 +2714,8 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) heap_close(pg_class, RowExclusiveLock); /* - * Make the pg_class row change visible. This will cause the relcache - * entry to get updated, too. + * Make the pg_class row change visible, as well as the relation map + * change if any. This will cause the relcache entry to get updated, too. */ CommandCounterIncrement(); @@ -2687,6 +2764,11 @@ RelationCacheInitialize(void) ctl.hash = oid_hash; RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE, &ctl, HASH_ELEM | HASH_FUNCTION); + + /* + * relation mapper needs initialized too + */ + RelationMapInitialize(); } /* @@ -2705,6 +2787,11 @@ RelationCacheInitializePhase2(void) MemoryContext oldcxt; /* + * relation mapper needs initialized too + */ + RelationMapInitializePhase2(); + + /* * In bootstrap mode, pg_database isn't there yet anyway, so do nothing. 
*/ if (IsBootstrapProcessingMode()) @@ -2753,6 +2840,11 @@ RelationCacheInitializePhase3(void) bool needNewCacheFile = !criticalSharedRelcachesBuilt; /* + * relation mapper needs initialized too + */ + RelationMapInitializePhase3(); + + /* * switch to cache memory context */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c new file mode 100644 index 00000000000..b22cadf6eb5 --- /dev/null +++ b/src/backend/utils/cache/relmapper.c @@ -0,0 +1,913 @@ +/*------------------------------------------------------------------------- + * + * relmapper.c + * Catalog-to-filenode mapping + * + * For most tables, the physical file underlying the table is specified by + * pg_class.relfilenode. However, that obviously won't work for pg_class + * itself, nor for the other "nailed" catalogs for which we have to be able + * to set up working Relation entries without access to pg_class. It also + * does not work for shared catalogs, since there is no practical way to + * update other databases' pg_class entries when relocating a shared catalog. + * Therefore, for these special catalogs (henceforth referred to as "mapped + * catalogs") we rely on a separately maintained file that shows the mapping + * from catalog OIDs to filenode numbers. Each database has a map file for + * its local mapped catalogs, and there is a separate map file for shared + * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries. + * + * Relocation of a normal table is committed (ie, the new physical file becomes + * authoritative) when the pg_class row update commits. For mapped catalogs, + * the act of updating the map file is effectively commit of the relocation. + * We postpone the file update till just before commit of the transaction + * doing the rewrite, but there is necessarily a window between. 
Therefore + * mapped catalogs can only be relocated by operations such as VACUUM FULL + * and CLUSTER, which make no transactionally-significant changes: it must be + * safe for the new file to replace the old, even if the transaction itself + * aborts. An important factor here is that the indexes and toast table of + * a mapped catalog must also be mapped, so that the rewrites/relocations of + * all these files commit in a single map file update rather than being tied + * to transaction commit. + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/cache/relmapper.c,v 1.1 2010/02/07 20:48:10 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> + +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "catalog/storage.h" +#include "miscadmin.h" +#include "storage/fd.h" +#include "storage/lwlock.h" +#include "utils/inval.h" +#include "utils/pg_crc.h" +#include "utils/relmapper.h" + + +/* + * The map file is critical data: we have no automatic method for recovering + * from loss or corruption of it. We use a CRC so that we can detect + * corruption. To minimize the risk of failed updates, the map file should + * be kept to no more than one standard-size disk sector (ie 512 bytes), + * and we use overwrite-in-place rather than playing renaming games. + * The struct layout below is designed to occupy exactly 512 bytes, which + * might make filesystem updates a bit more efficient. + * + * Entries in the mappings[] array are in no particular order. We could + * speed searching by insisting on OID order, but it really shouldn't be + * worth the trouble given the intended size of the mapping sets. 
+ */
+#define RELMAPPER_FILENAME		"pg_filenode.map"
+
+#define RELMAPPER_FILEMAGIC		0x592717	/* version ID value */
+
+#define MAX_MAPPINGS			62		/* 62 * 8 + 16 = 512 */
+
+/* One catalog-OID-to-filenode association, as stored in a map file. */
+typedef struct RelMapping
+{
+	Oid			mapoid;			/* OID of a catalog */
+	Oid			mapfilenode;	/* its filenode number */
+} RelMapping;
+
+/*
+ * Image of a whole map file.  The same struct is also used for the in-memory
+ * update sets declared below.
+ *
+ * The crc field is declared pg_crc32 so that it has the same type as the
+ * value the CRC32 macros compute (load_relmap_file compares it against a
+ * pg_crc32 local using EQ_CRC32), rather than mixing signed and unsigned.
+ * NOTE(review): this assumes pg_crc32 is exactly 32 bits wide, so that the
+ * struct stays 512 bytes and the on-disk layout is unchanged --- confirm
+ * against utils/pg_crc.h.
+ */
+typedef struct RelMapFile
+{
+	int32		magic;			/* always RELMAPPER_FILEMAGIC */
+	int32		num_mappings;	/* number of valid RelMapping entries */
+	RelMapping	mappings[MAX_MAPPINGS];
+	pg_crc32	crc;			/* CRC of all above */
+	int32		pad;			/* to make the struct size be 512 exactly */
+} RelMapFile;
+
+/*
+ * The currently known contents of the shared map file and our database's
+ * local map file are stored here.  These can be reloaded from disk
+ * immediately whenever we receive an update sinval message.
+ */
+static RelMapFile shared_map;
+static RelMapFile local_map;
+
+/*
+ * We use the same RelMapFile data structure to track uncommitted local
+ * changes in the mappings (but note the magic and crc fields are not made
+ * valid in these variables).  Currently, map updates are not allowed within
+ * subtransactions, so one set of transaction-level changes is sufficient.
+ *
+ * The active_xxx variables contain updates that are valid in our transaction
+ * and should be honored by RelationMapOidToFilenode.  The pending_xxx
+ * variables contain updates we have been told about that aren't active yet;
+ * they will become active at the next CommandCounterIncrement.  This setup
+ * lets map updates act similarly to updates of pg_class rows, ie, they
+ * become visible only at the next CommandCounterIncrement boundary.
+ */
+static RelMapFile active_shared_updates;
+static RelMapFile active_local_updates;
+static RelMapFile pending_shared_updates;
+static RelMapFile pending_local_updates;
+
+
+/* non-export function prototypes */
+static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
+							 bool add_okay);
+static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
+							  bool add_okay);
+static void load_relmap_file(bool shared);
+static void write_relmap_file(bool shared, RelMapFile *newmap,
+							  bool write_wal, bool send_sinval, bool preserve_files,
+							  Oid dbid, Oid tsid, const char *dbpath);
+static void perform_relmap_update(bool shared, const RelMapFile *updates);
+
+
+/*
+ * RelationMapOidToFilenode
+ *
+ * The raison d'etre ... given a relation OID, look up its filenode.
+ *
+ * Although shared and local relation OIDs should never overlap, the caller
+ * always knows which we need --- so pass that information to avoid useless
+ * searching.
+ *
+ * Returns InvalidOid if the OID is not known (which should never happen,
+ * but the caller is in a better position to report a meaningful error).
+ */
+Oid
+RelationMapOidToFilenode(Oid relationId, bool shared)
+{
+	const RelMapFile *search_order[2];
+	int			which;
+
+	/*
+	 * Our own transaction's active updates take precedence over the main
+	 * map, so they are searched first.
+	 */
+	if (shared)
+	{
+		search_order[0] = &active_shared_updates;
+		search_order[1] = &shared_map;
+	}
+	else
+	{
+		search_order[0] = &active_local_updates;
+		search_order[1] = &local_map;
+	}
+
+	for (which = 0; which < 2; which++)
+	{
+		const RelMapFile *candidate = search_order[which];
+		int32		n;
+
+		for (n = 0; n < candidate->num_mappings; n++)
+		{
+			if (candidate->mappings[n].mapoid == relationId)
+				return candidate->mappings[n].mapfilenode;
+		}
+	}
+
+	/* Unknown OID; the caller decides how to complain */
+	return InvalidOid;
+}
+
+/*
+ * RelationMapUpdateMap
+ *
+ * Record a new relfilenode for the given mapped relation.
+ *
+ * During bootstrap the entry goes straight into the permanent in-memory map.
+ * Otherwise it goes into this transaction's "active" update set when
+ * immediate is true, or into the "pending" set (activated at the next
+ * CommandCounterIncrement) when it is false.
+ */
+void
+RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
+					 bool immediate)
+{
+	RelMapFile *target;
+
+	if (IsBootstrapProcessingMode())
+	{
+		/* Bootstrap: install directly in the permanent map */
+		target = shared ? &shared_map : &local_map;
+	}
+	else
+	{
+		/*
+		 * Map changes inside subtransactions are not supported; doing so
+		 * would need extra bookkeeping that hasn't seemed worthwhile.
+		 */
+		if (GetCurrentTransactionNestLevel() > 1)
+			elog(ERROR, "cannot change relation mapping within subtransaction");
+
+		if (immediate)
+		{
+			/* Active right away, but visible only to this backend */
+			target = shared ? &active_shared_updates : &active_local_updates;
+		}
+		else
+		{
+			/* Held back until the next CommandCounterIncrement */
+			target = shared ? &pending_shared_updates : &pending_local_updates;
+		}
+	}
+
+	apply_map_update(target, relationId, fileNode, true);
+}
+
+/*
+ * apply_map_update
+ *
+ * Set the filenode for relationId in the given map variable: overwrite the
+ * existing entry if there is one, else append a new entry.
+ *
+ * Callers that know an entry must already exist pass add_okay = false, in
+ * which case a missing entry raises an error instead of being added.
+ */
+static void
+apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
+{
+	RelMapping *slot;
+	int32		idx;
+
+	/* First look for an entry to overwrite */
+	for (idx = 0; idx < map->num_mappings; idx++)
+	{
+		slot = &map->mappings[idx];
+		if (slot->mapoid == relationId)
+		{
+			slot->mapfilenode = fileNode;
+			return;
+		}
+	}
+
+	/* No existing entry: append one, if that's allowed and there is room */
+	if (!add_okay)
+		elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
+			 relationId);
+	if (map->num_mappings >= MAX_MAPPINGS)
+		elog(ERROR, "ran out of space in relation map");
+
+	slot = &map->mappings[map->num_mappings];
+	slot->mapoid = relationId;
+	slot->mapfilenode = fileNode;
+	map->num_mappings++;
+}
+
+/*
+ * merge_map_updates
+ *
+ * Apply every entry of an update set to the target map, one
+ * apply_map_update call per entry, passing add_okay through unchanged.
+ */
+static void
+merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
+{
+	int32		n;
+
+	for (n = 0; n < updates->num_mappings; n++)
+	{
+		const RelMapping *entry = &updates->mappings[n];
+
+		apply_map_update(map, entry->mapoid, entry->mapfilenode, add_okay);
+	}
+}
+
+/*
+ * RelationMapRemoveMapping
+ *
+ * Remove a relation's entry in the map.  This is only allowed for "active"
+ * (but not committed) local mappings.
+ * We need it so we can back out the
+ * entry for the transient target file when doing VACUUM FULL/CLUSTER on
+ * a mapped relation.
+ */
+void
+RelationMapRemoveMapping(Oid relationId)
+{
+	RelMapFile *active = &active_local_updates;
+	int32		pos;
+
+	for (pos = 0; pos < active->num_mappings; pos++)
+	{
+		if (active->mappings[pos].mapoid != relationId)
+			continue;
+
+		/* Found it: move the last entry into this slot and shrink */
+		active->num_mappings--;
+		active->mappings[pos] = active->mappings[active->num_mappings];
+		return;
+	}
+
+	elog(ERROR, "could not find temporary mapping for relation %u",
+		 relationId);
+}
+
+/*
+ * RelationMapInvalidate
+ *
+ * Called for SI cache flush messages: re-read the indicated map file.
+ *
+ * A process may receive such a message before it has loaded the mapping
+ * files, or without ever loading them; for example the autovacuum launcher,
+ * which *must not* try to read a local map since it is attached to no
+ * particular database.  So the file is re-read only if our copy of that
+ * map is currently valid, ie, carries the expected magic number.
+ */
+void
+RelationMapInvalidate(bool shared)
+{
+	const RelMapFile *loaded = shared ? &shared_map : &local_map;
+
+	if (loaded->magic == RELMAPPER_FILEMAGIC)
+		load_relmap_file(shared);
+}
+
+/*
+ * RelationMapInvalidateAll
+ *
+ * Re-read every map file we currently hold a valid copy of, shared map
+ * first.  This is used to recover from SI message buffer overflow, where
+ * we can't be sure whether an inval message was missed.
+ */
+void
+RelationMapInvalidateAll(void)
+{
+	RelationMapInvalidate(true);
+	RelationMapInvalidate(false);
+}
+
+/*
+ * AtCCI_RelationMap
+ *
+ * Activate any "pending" relation map updates at CommandCounterIncrement time.
+ */
+void
+AtCCI_RelationMap(void)
+{
+	RelMapFile *pending;
+
+	/* Fold pending shared-map updates into the active set, then clear them */
+	pending = &pending_shared_updates;
+	if (pending->num_mappings != 0)
+	{
+		merge_map_updates(&active_shared_updates, pending, true);
+		pending->num_mappings = 0;
+	}
+
+	/* Likewise for the local map */
+	pending = &pending_local_updates;
+	if (pending->num_mappings != 0)
+	{
+		merge_map_updates(&active_local_updates, pending, true);
+		pending->num_mappings = 0;
+	}
+}
+
+/*
+ * AtEOXact_RelationMap
+ *
+ * Relation-map processing at main-transaction commit or abort.
+ *
+ * On commit, this must run as late as possible before the actual
+ * transaction commit, to keep the window small in which the transaction
+ * could still roll back after its map changes have been committed.  Nothing
+ * critically bad happens in that case, but we'd rather avoid it, since
+ * useful updates to the relations' pg_class row(s) could be lost.
+ *
+ * On abort, all uncommitted map changes are simply discarded; normal
+ * post-abort cleanup takes care of fixing relcache entries.
+ */
+void
+AtEOXact_RelationMap(bool isCommit)
+{
+	if (!isCommit)
+	{
+		/* Abort --- forget every active and pending update */
+		active_shared_updates.num_mappings = 0;
+		active_local_updates.num_mappings = 0;
+		pending_shared_updates.num_mappings = 0;
+		pending_local_updates.num_mappings = 0;
+		return;
+	}
+
+	/*
+	 * Nothing should still be "pending" at commit.  (Such entries could
+	 * logically be treated as committed, but in the current code this
+	 * should never happen.)
+	 */
+	Assert(pending_shared_updates.num_mappings == 0);
+	Assert(pending_local_updates.num_mappings == 0);
+
+	/*
+	 * Push each non-empty active set out to its map file, and empty the set
+	 * once that is done.
+	 */
+	if (active_shared_updates.num_mappings != 0)
+	{
+		perform_relmap_update(true, &active_shared_updates);
+		active_shared_updates.num_mappings = 0;
+	}
+	if (active_local_updates.num_mappings != 0)
+	{
+		perform_relmap_update(false, &active_local_updates);
+		active_local_updates.num_mappings = 0;
+	}
+}
+
+/*
+ * AtPrepare_RelationMap
+ *
+ * Relation-map processing at PREPARE.
+ *
+ * Preparing a transaction that has changed the map is not currently
+ * supported, so raise an error if any update set is non-empty.
+ */
+void
+AtPrepare_RelationMap(void)
+{
+	/* Nothing to complain about if no update set has entries */
+	if (active_shared_updates.num_mappings == 0 &&
+		active_local_updates.num_mappings == 0 &&
+		pending_shared_updates.num_mappings == 0 &&
+		pending_local_updates.num_mappings == 0)
+		return;
+
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot PREPARE a transaction that modified relation mapping")));
+}
+
+/*
+ * CheckPointRelationMap
+ *
+ * Called during a checkpoint.  Any relation map update that was WAL-logged
+ * before the checkpoint started must be safely on disk, so that it will not
+ * need to be replayed later.  This is unlikely to be performance-critical,
+ * so the method is simple: acquire and immediately release the
+ * RelationMappingLock.  Any already-logged map update is then known to be
+ * complete, because write_relmap_file fsyncs the map file before the lock
+ * is released.
+ */
+void
+CheckPointRelationMap(void)
+{
+	/* Wait out any map update in progress; we need nothing beyond that */
+	LWLockAcquire(RelationMappingLock, LW_SHARED);
+	LWLockRelease(RelationMappingLock);
+}
+
+/*
+ * RelationMapFinishBootstrap
+ *
+ * Write out the initial relation mapping files at the completion of
+ * bootstrap.  All the mapped files should have been made known to us
+ * via RelationMapUpdateMap calls.
+ */
+void
+RelationMapFinishBootstrap(void)
+{
+	Assert(IsBootstrapProcessingMode());
+
+	/* Shouldn't be anything "pending" ... */
+	Assert(active_shared_updates.num_mappings == 0);
+	Assert(active_local_updates.num_mappings == 0);
+	Assert(pending_shared_updates.num_mappings == 0);
+	Assert(pending_local_updates.num_mappings == 0);
+
+	/* Write the files; no WAL or sinval needed */
+	write_relmap_file(true, &shared_map, false, false, false,
+					  InvalidOid, GLOBALTABLESPACE_OID, NULL);
+	write_relmap_file(false, &local_map, false, false, false,
+					  MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
+}
+
+/*
+ * RelationMapInitialize
+ *
+ * This initializes the mapper module at process startup.  We can't access the
+ * database yet, so just make sure the maps are empty.
+ */
+void
+RelationMapInitialize(void)
+{
+	/* The static variables should initialize to zeroes, but let's be sure */
+	shared_map.magic = 0;		/* mark it not loaded */
+	local_map.magic = 0;
+	shared_map.num_mappings = 0;
+	local_map.num_mappings = 0;
+	active_shared_updates.num_mappings = 0;
+	active_local_updates.num_mappings = 0;
+	pending_shared_updates.num_mappings = 0;
+	pending_local_updates.num_mappings = 0;
+}
+
+/*
+ * RelationMapInitializePhase2
+ *
+ * This is called to prepare for access to pg_database during startup.
+ * We should be able to read the shared map file now.
+ */
+void
+RelationMapInitializePhase2(void)
+{
+	/*
+	 * In bootstrap mode, the map file isn't there yet, so do nothing.
+	 */
+	if (IsBootstrapProcessingMode())
+		return;
+
+	/*
+	 * Load the shared map file, die on error.
+	 */
+	load_relmap_file(true);
+}
+
+/*
+ * RelationMapInitializePhase3
+ *
+ * This is called as soon as we have determined MyDatabaseId and set up
+ * DatabasePath.  At this point we should be able to read the local map file.
+ */
+void
+RelationMapInitializePhase3(void)
+{
+	/*
+	 * In bootstrap mode, the map file isn't there yet, so do nothing.
+	 */
+	if (IsBootstrapProcessingMode())
+		return;
+
+	/*
+	 * Load the local map file, die on error.
+	 */
+	load_relmap_file(false);
+}
+
+/*
+ * load_relmap_file -- load data from the shared or local map file
+ *
+ * Reads the whole file into shared_map (shared = true) or local_map
+ * (shared = false), then validates the magic number, the mapping count
+ * and the CRC.
+ *
+ * Because the map file is essential for access to core system catalogs,
+ * failure to read it is a fatal error.  Note that the target variable is
+ * overwritten by the read before validation happens.
+ *
+ * Note that the local case requires DatabasePath to be set up.
+ */
+static void
+load_relmap_file(bool shared)
+{
+	RelMapFile *map;
+	char		mapfilename[MAXPGPATH];
+	pg_crc32	crc;
+	int			fd;
+	int			nread;
+
+	if (shared)
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
+				 RELMAPPER_FILENAME);
+		map = &shared_map;
+	}
+	else
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
+				 DatabasePath, RELMAPPER_FILENAME);
+		map = &local_map;
+	}
+
+	/* Read data ... */
+	fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not open relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	/*
+	 * Note: we could take RelationMappingLock in shared mode here, but it
+	 * seems unnecessary since our read() should be atomic against any
+	 * concurrent updater's write().  If the file is updated shortly after
+	 * we look, the sinval signaling mechanism will make us re-read it
+	 * before we are able to access any relation that's affected by the
+	 * change.
+	 */
+	nread = (int) read(fd, map, sizeof(RelMapFile));
+	if (nread != (int) sizeof(RelMapFile))
+	{
+		/*
+		 * read() sets errno only when it returns -1.  A short read means the
+		 * file is truncated; reporting %m there would print an unrelated
+		 * (possibly "Success") error string, so report the byte counts
+		 * instead.
+		 */
+		if (nread < 0)
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("could not read relation mapping file \"%s\": %m",
+							mapfilename)));
+		else
+			ereport(FATAL,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("could not read relation mapping file \"%s\": read %d of %d bytes",
+							mapfilename, nread, (int) sizeof(RelMapFile))));
+	}
+
+	close(fd);
+
+	/* check for correct magic number, etc */
+	if (map->magic != RELMAPPER_FILEMAGIC ||
+		map->num_mappings < 0 ||
+		map->num_mappings > MAX_MAPPINGS)
+		ereport(FATAL,
+				(errmsg("relation mapping file \"%s\" contains invalid data",
+						mapfilename)));
+
+	/* verify the CRC */
+	INIT_CRC32(crc);
+	COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc));
+	FIN_CRC32(crc);
+
+	if (!EQ_CRC32(crc, map->crc))
+		ereport(FATAL,
+				(errmsg("relation mapping file \"%s\" contains incorrect checksum",
+						mapfilename)));
+}
+
+/*
+ * Write out a new shared or local map file with the given contents.
+ *
+ * The magic number and CRC are automatically updated in *newmap.  On
+ * success, we copy the data to the appropriate permanent static variable.
+ *
+ * If write_wal is TRUE then an appropriate WAL message is emitted.
+ * (It will be false for bootstrap and WAL replay cases.)
+ *
+ * If send_sinval is TRUE then a SI invalidation message is sent.
+ * (This should be true except in bootstrap case.)
+ *
+ * If preserve_files is TRUE then the storage manager is warned not to
+ * delete the files listed in the map.
+ *
+ * Because this may be called during WAL replay when MyDatabaseId,
+ * DatabasePath, etc aren't valid, we require the caller to pass in suitable
+ * values.  The caller is also responsible for being sure no concurrent
+ * map update could be happening.
+ */
+static void
+write_relmap_file(bool shared, RelMapFile *newmap,
+				  bool write_wal, bool send_sinval, bool preserve_files,
+				  Oid dbid, Oid tsid, const char *dbpath)
+{
+	int			fd;
+	RelMapFile *realmap;
+	char		mapfilename[MAXPGPATH];
+
+	/*
+	 * Fill in the overhead fields and update CRC.
+	 */
+	newmap->magic = RELMAPPER_FILEMAGIC;
+	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
+		elog(ERROR, "attempt to write bogus relation mapping");
+
+	INIT_CRC32(newmap->crc);
+	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
+	FIN_CRC32(newmap->crc);
+
+	/*
+	 * Open the target file.  We prefer to do this before entering the
+	 * critical section, so that an open() failure need not force PANIC.
+	 *
+	 * Note: since we use BasicOpenFile, we are nominally responsible for
+	 * ensuring the fd is closed on error.  In practice, this isn't important
+	 * because either an error happens inside the critical section, or we
+	 * are in bootstrap or WAL replay; so an error past this point is always
+	 * fatal anyway.
+	 */
+	if (shared)
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
+				 RELMAPPER_FILENAME);
+		realmap = &shared_map;
+	}
+	else
+	{
+		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
+				 dbpath, RELMAPPER_FILENAME);
+		realmap = &local_map;
+	}
+
+	fd = BasicOpenFile(mapfilename,
+					   O_WRONLY | O_CREAT | PG_BINARY,
+					   S_IRUSR | S_IWUSR);
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	if (write_wal)
+	{
+		xl_relmap_update xlrec;
+		XLogRecData rdata[2];
+		XLogRecPtr	lsn;
+
+		/* now errors are fatal ... */
+		START_CRIT_SECTION();
+
+		xlrec.dbid = dbid;
+		xlrec.tsid = tsid;
+		xlrec.nbytes = sizeof(RelMapFile);
+
+		rdata[0].data = (char *) (&xlrec);
+		rdata[0].len = MinSizeOfRelmapUpdate;
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].next = &(rdata[1]);
+		rdata[1].data = (char *) newmap;
+		rdata[1].len = sizeof(RelMapFile);
+		rdata[1].buffer = InvalidBuffer;
+		rdata[1].next = NULL;
+
+		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);
+
+		/* As always, WAL must hit the disk before the data update does */
+		XLogFlush(lsn);
+	}
+
+	errno = 0;
+	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
+	{
+		/* if write didn't set errno, assume problem is no disk space */
+		if (errno == 0)
+			errno = ENOSPC;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to relation mapping file \"%s\": %m",
+						mapfilename)));
+	}
+
+	/*
+	 * We choose to fsync the data to disk before considering the task done.
+	 * It would be possible to relax this if it turns out to be a performance
+	 * issue, but it would complicate checkpointing --- see notes for
+	 * CheckPointRelationMap.
+	 */
+	if (pg_fsync(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	if (close(fd))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close relation mapping file \"%s\": %m",
+						mapfilename)));
+
+	/*
+	 * Now that the file is safely on disk, send sinval message to let other
+	 * backends know to re-read it.  We must do this inside the critical
+	 * section: if for some reason we fail to send the message, we have to
+	 * force a database-wide PANIC.  Otherwise other backends might continue
+	 * execution with stale mapping information, which would be catastrophic
+	 * as soon as others began to use the now-committed data.
+	 */
+	if (send_sinval)
+		CacheInvalidateRelmap(dbid);
+
+	/*
+	 * Make sure that the files listed in the map are not deleted if the
+	 * outer transaction aborts.  This had better be within the critical
+	 * section too: it's not likely to fail, but if it did, we'd arrive
+	 * at transaction abort with the files still vulnerable.  PANICing
+	 * will leave things in a good state on-disk.
+	 *
+	 * Note: we're cheating a little bit here by assuming that mapped files
+	 * are either in pg_global or the database's default tablespace.
+	 */
+	if (preserve_files)
+	{
+		int32		i;
+
+		for (i = 0; i < newmap->num_mappings; i++)
+		{
+			RelFileNode rnode;
+
+			rnode.spcNode = tsid;
+			rnode.dbNode = dbid;
+			rnode.relNode = newmap->mappings[i].mapfilenode;
+			RelationPreserveStorage(rnode);
+		}
+	}
+
+	/* Success, update permanent copy */
+	memcpy(realmap, newmap, sizeof(RelMapFile));
+
+	/* Critical section done */
+	if (write_wal)
+		END_CRIT_SECTION();
+}
+
+/*
+ * Merge the specified updates into the appropriate "real" map,
+ * and write out the changes.  This function must be used for committing
+ * updates during normal multiuser operation.
+ */
+static void
+perform_relmap_update(bool shared, const RelMapFile *updates)
+{
+	RelMapFile	newmap;
+
+	/*
+	 * Anyone updating a relation's mapping info should take exclusive lock
+	 * on that rel and hold it until commit.  This ensures that there will
+	 * not be concurrent updates on the same mapping value; but there could
+	 * easily be concurrent updates on different values in the same file.
+	 * We cover that by acquiring the RelationMappingLock, re-reading the
+	 * target file to ensure it's up to date, applying the updates, and
+	 * writing the data before releasing RelationMappingLock.
+	 *
+	 * There is only one RelationMappingLock.  In principle we could try to
+	 * have one per mapping file, but it seems unlikely to be worth the
+	 * trouble.
+	 */
+	LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
+
+	/* Be certain we see any other updates just made */
+	load_relmap_file(shared);
+
+	/* Prepare updated data in a local variable */
+	if (shared)
+		memcpy(&newmap, &shared_map, sizeof(RelMapFile));
+	else
+		memcpy(&newmap, &local_map, sizeof(RelMapFile));
+
+	/* Apply the updates to newmap.  No new mappings should appear. */
+	merge_map_updates(&newmap, updates, false);
+
+	/* Write out the updated map and do other necessary tasks */
+	write_relmap_file(shared, &newmap, true, true, true,
+					  (shared ? InvalidOid : MyDatabaseId),
+					  (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
+					  DatabasePath);
+
+	/* Now we can release the lock */
+	LWLockRelease(RelationMappingLock);
+}
+
+/*
+ * RELMAP resource manager's routines
+ */
+void
+relmap_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in relmap records */
+	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
+	if (info == XLOG_RELMAP_UPDATE)
+	{
+		xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
+		RelMapFile	newmap;
+		char	   *dbpath;
+
+		if (xlrec->nbytes != sizeof(RelMapFile))
+			elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
+				 xlrec->nbytes);
+		memcpy(&newmap, xlrec->data, sizeof(newmap));
+
+		/* We need to construct the pathname for this database */
+		dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
+
+		/*
+		 * Write out the new map and send sinval, but of course don't
+		 * write a new WAL entry.  There's no surrounding transaction
+		 * to tell to preserve files, either.
+		 *
+		 * There shouldn't be anyone else updating relmaps during WAL replay,
+		 * so we don't bother to take the RelationMappingLock.  We would
+		 * need to do so if load_relmap_file needed to interlock against
+		 * writers.
+		 */
+		write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
+						  false, true, false,
+						  xlrec->dbid, xlrec->tsid, dbpath);
+
+		pfree(dbpath);
+	}
+	else
+		elog(PANIC, "relmap_redo: unknown op code %u", info);
+}
+
+void
+relmap_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;
+
+	if (info == XLOG_RELMAP_UPDATE)
+	{
+		xl_relmap_update *xlrec = (xl_relmap_update *) rec;
+
+		appendStringInfo(buf, "update relmap: database %u tablespace %u size %u",
+						 xlrec->dbid, xlrec->tsid, xlrec->nbytes);
+	}
+	else
+		appendStringInfo(buf, "UNKNOWN");
+}
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 21664d8c7e2..31bdc65ec16 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/init/miscinit.c,v 1.180 2010/01/02 16:57:56 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/init/miscinit.c,v 1.181 2010/02/07 20:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -64,62 +64,6 @@ static char socketLockFile[MAXPGPATH];
 
 bool		IgnoreSystemIndexes = false;
 
-/* ----------------------------------------------------------------
- *				system index reindexing support
- *
- *		When we are busy reindexing a system index, this code provides support
- *		for preventing catalog lookups from using that index.
- * ----------------------------------------------------------------
- */
-
-static Oid	currentlyReindexedHeap = InvalidOid;
-static Oid	currentlyReindexedIndex = InvalidOid;
-
-/*
- * ReindexIsProcessingHeap
- *		True if heap specified by OID is currently being reindexed.
- */
-bool
-ReindexIsProcessingHeap(Oid heapOid)
-{
-	return heapOid == currentlyReindexedHeap;
-}
-
-/*
- * ReindexIsProcessingIndex
- *		True if index specified by OID is currently being reindexed.
- */ -bool -ReindexIsProcessingIndex(Oid indexOid) -{ - return indexOid == currentlyReindexedIndex; -} - -/* - * SetReindexProcessing - * Set flag that specified heap/index are being reindexed. - */ -void -SetReindexProcessing(Oid heapOid, Oid indexOid) -{ - Assert(OidIsValid(heapOid) && OidIsValid(indexOid)); - /* Reindexing is not re-entrant. */ - if (OidIsValid(currentlyReindexedIndex)) - elog(ERROR, "cannot reindex while reindexing"); - currentlyReindexedHeap = heapOid; - currentlyReindexedIndex = indexOid; -} - -/* - * ResetReindexProcessing - * Unset reindexing status. - */ -void -ResetReindexProcessing(void) -{ - currentlyReindexedHeap = InvalidOid; - currentlyReindexedIndex = InvalidOid; -} /* ---------------------------------------------------------------- * database path / name support stuff diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index ae04b6f2878..538a518ef55 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -12,7 +12,7 @@ * by PostgreSQL * * IDENTIFICATION - * $PostgreSQL: pgsql/src/bin/pg_dump/pg_dump.c,v 1.569 2010/01/28 23:21:12 petere Exp $ + * $PostgreSQL: pgsql/src/bin/pg_dump/pg_dump.c,v 1.570 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2300,6 +2300,12 @@ binary_upgrade_set_relfilenodes(PQExpBuffer upgrade_buffer, Oid pg_class_oid, Oid pg_class_reltoastrelid; Oid pg_class_reltoastidxid; + /* + * Note: we don't need to use pg_relation_filenode() here because this + * function is not intended to be used against system catalogs. + * Otherwise we'd have to worry about which versions pg_relation_filenode + * is available in. 
+ */ appendPQExpBuffer(upgrade_query, "SELECT c.relfilenode, c.reltoastrelid, t.reltoastidxid " "FROM pg_catalog.pg_class c LEFT JOIN " diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index 32b1bd535cc..72ee757f70f 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.20 2009/12/19 01:32:42 sriggs Exp $ + * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.21 2010/02/07 20:48:11 tgl Exp $ */ #ifndef RMGR_H #define RMGR_H @@ -23,6 +23,7 @@ typedef uint8 RmgrId; #define RM_DBASE_ID 4 #define RM_TBLSPC_ID 5 #define RM_MULTIXACT_ID 6 +#define RM_RELMAP_ID 7 #define RM_STANDBY_ID 8 #define RM_HEAP2_ID 9 #define RM_HEAP_ID 10 diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 236983fe07a..b8401df7722 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catalog.h,v 1.47 2010/01/12 02:42:52 momjian Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catalog.h,v 1.48 2010/02/07 20:48:11 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -45,7 +45,6 @@ extern bool IsSharedRelation(Oid relationId); extern Oid GetNewOid(Relation relation); extern Oid GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn); -extern Oid GetNewRelFileNode(Oid reltablespace, bool relisshared, - Relation pg_class); +extern Oid GetNewRelFileNode(Oid reltablespace, Relation pg_class); #endif /* CATALOG_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index e12ec58ed69..4a4ea6b492b 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2010, 
PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.582 2010/02/01 03:14:43 itagaki Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.583 2010/02/07 20:48:11 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201002011 +#define CATALOG_VERSION_NO 201002071 #endif diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index 9c16737adaa..d733dbb32e3 100644 --- a/src/include/catalog/heap.h +++ b/src/include/catalog/heap.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.96 2010/01/28 23:21:12 petere Exp $ + * $PostgreSQL: pgsql/src/include/catalog/heap.h,v 1.97 2010/02/07 20:48:11 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,6 +41,7 @@ extern Relation heap_create(const char *relname, TupleDesc tupDesc, char relkind, bool shared_relation, + bool mapped_relation, bool allow_system_table_mods); extern Oid heap_create_with_catalog(const char *relname, @@ -54,6 +55,7 @@ extern Oid heap_create_with_catalog(const char *relname, List *cooked_constraints, char relkind, bool shared_relation, + bool mapped_relation, bool oidislocal, int oidinhcount, OnCommitAction oncommit, @@ -109,8 +111,10 @@ extern Form_pg_attribute SystemAttributeDefinition(AttrNumber attno, extern Form_pg_attribute SystemAttributeByName(const char *attname, bool relhasoids); -extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind); +extern void CheckAttributeNamesTypes(TupleDesc tupdesc, char relkind, + bool allow_system_table_mods); -extern void CheckAttributeType(const char *attname, Oid atttypid); +extern void 
CheckAttributeType(const char *attname, Oid atttypid, + bool allow_system_table_mods); #endif /* HEAP_H */ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index bdb1c71a734..2bacf827c9f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/index.h,v 1.81 2010/02/03 01:14:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/index.h,v 1.82 2010/02/07 20:48:11 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,6 +71,9 @@ extern double IndexBuildHeapScan(Relation heapRelation, extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot); extern void reindex_index(Oid indexId); -extern bool reindex_relation(Oid relid, bool toast_too); +extern bool reindex_relation(Oid relid, bool toast_too, bool heap_rebuilt); + +extern bool ReindexIsProcessingHeap(Oid heapOid); +extern bool ReindexIsProcessingIndex(Oid indexOid); #endif /* INDEX_H */ diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index aa35109edc7..00d0dbc975e 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_class.h,v 1.120 2010/01/28 23:21:12 petere Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_class.h,v 1.121 2010/02/07 20:48:11 tgl Exp $ * * NOTES * the genbki.pl script reads this file and generates .bki @@ -38,6 +38,7 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO Oid relowner; /* class owner */ Oid relam; /* index access method; 0 if not an index */ Oid relfilenode; /* identifier of physical storage file */ + /* 
relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ Oid reltablespace; /* identifier of table space for relation */ int4 relpages; /* # of blocks (not always up-to-date) */ float4 reltuples; /* # of tuples (not always up-to-date) */ @@ -128,13 +129,13 @@ typedef FormData_pg_class *Form_pg_class; */ /* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */ -DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 1247 0 0 0 0 0 f f f r 28 0 t f f f f f 3 _null_ _null_ )); +DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f f r 28 0 t f f f f f 3 _null_ _null_ )); DESCR(""); -DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 1249 0 0 0 0 0 f f f r 19 0 f f f f f f 3 _null_ _null_ )); +DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f f r 19 0 f f f f f f 3 _null_ _null_ )); DESCR(""); -DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 1255 0 0 0 0 0 f f f r 25 0 t f f f f f 3 _null_ _null_ )); +DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f f r 25 0 t f f f f f 3 _null_ _null_ )); DESCR(""); -DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 1259 0 0 0 0 0 f f f r 27 0 t f f f f f 3 _null_ _null_ )); +DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f f r 27 0 t f f f f f 3 _null_ _null_ )); DESCR(""); #define RELKIND_INDEX 'i' /* secondary index */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index ec45367b4ae..727b13e264c 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.567 2010/02/01 03:14:44 itagaki Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.568 2010/02/07 20:48:11 tgl Exp $ * * NOTES * The script catalog/genbki.pl reads this file and generates .bki 
@@ -3741,6 +3741,10 @@ DATA(insert OID = 2997 ( pg_table_size PGNSP PGUID 12 1 0 0 f f f t f v 1 0 20 DESCR("disk space usage for the specified table, including TOAST, free space and visibility map"); DATA(insert OID = 2998 ( pg_indexes_size PGNSP PGUID 12 1 0 0 f f f t f v 1 0 20 "2205" _null_ _null_ _null_ _null_ pg_indexes_size _null_ _null_ _null_ )); DESCR("disk space usage for all indexes attached to the specified table"); +DATA(insert OID = 2999 ( pg_relation_filenode PGNSP PGUID 12 1 0 0 f f f t f s 1 0 26 "2205" _null_ _null_ _null_ _null_ pg_relation_filenode _null_ _null_ _null_ )); +DESCR("filenode identifier of relation"); +DATA(insert OID = 3034 ( pg_relation_filepath PGNSP PGUID 12 1 0 0 f f f t f s 1 0 25 "2205" _null_ _null_ _null_ _null_ pg_relation_filepath _null_ _null_ _null_ )); +DESCR("file path of relation"); DATA(insert OID = 2316 ( postgresql_fdw_validator PGNSP PGUID 12 1 0 0 f f f t f i 2 0 16 "1009 26" _null_ _null_ _null_ _null_ postgresql_fdw_validator _null_ _null_ _null_)); diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index e6cafd8216f..f86cf9bbf54 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/storage.h,v 1.4 2010/01/02 16:58:02 momjian Exp $ + * $PostgreSQL: pgsql/src/include/catalog/storage.h,v 1.5 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,6 +22,7 @@ extern void RelationCreateStorage(RelFileNode rnode, bool istemp); extern void RelationDropStorage(Relation rel); +extern void RelationPreserveStorage(RelFileNode rnode); extern void RelationTruncate(Relation rel, BlockNumber nblocks); /* diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index f5357814367..0fecd1986ac 
100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994-5, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/commands/cluster.h,v 1.39 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/include/commands/cluster.h,v 1.40 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -25,9 +25,9 @@ extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid, extern void mark_index_clustered(Relation rel, Oid indexOid); extern Oid make_new_heap(Oid OIDOldHeap, Oid NewTableSpace); -extern void swap_relation_files(Oid r1, Oid r2, bool swap_toast_by_content, - TransactionId frozenXid); -extern void cleanup_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, - bool swap_toast_by_content); +extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, + bool is_system_catalog, + bool swap_toast_by_content, + TransactionId frozenXid); #endif /* CLUSTER_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index f47106f6cce..2face3a3bdb 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -13,7 +13,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/miscadmin.h,v 1.217 2010/01/02 16:58:00 momjian Exp $ + * $PostgreSQL: pgsql/src/include/miscadmin.h,v 1.218 2010/02/07 20:48:13 tgl Exp $ * * NOTES * some of the information in this file should be moved to other files. 
@@ -347,10 +347,6 @@ extern PGDLLIMPORT bool process_shared_preload_libraries_in_progress; extern char *shared_preload_libraries_string; extern char *local_preload_libraries_string; -extern void SetReindexProcessing(Oid heapOid, Oid indexOid); -extern void ResetReindexProcessing(void); -extern bool ReindexIsProcessingHeap(Oid heapOid); -extern bool ReindexIsProcessingIndex(Oid indexOid); extern void CreateDataDirLockFile(bool amPostmaster); extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster); extern void TouchSocketLockFile(void); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 74e98eca541..f0beb20a24b 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.43 2010/01/02 16:58:08 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.44 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -67,6 +67,7 @@ typedef enum LWLockId AutovacuumLock, AutovacuumScheduleLock, SyncScanLock, + RelationMappingLock, /* Individual lock IDs end here */ FirstBufMappingLock, FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS, diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index e2088270d05..b5e4e1134db 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.24 2010/01/02 16:58:08 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/relfilenode.h,v 1.25 2010/02/07 20:48:13 tgl Exp $ * 
*------------------------------------------------------------------------- */ @@ -61,6 +61,10 @@ typedef enum ForkNumber * identified by pg_database.dattablespace). However this shorthand * is NOT allowed in RelFileNode structs --- the real tablespace ID * must be supplied when setting spcNode. + * + * Note: in pg_class, relfilenode can be zero to denote that the relation + * is a "mapped" relation, whose current true filenode number is available + * from relmapper.c. Again, this case is NOT allowed in RelFileNodes. */ typedef struct RelFileNode { diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 9f7bb2b2eea..bad8f505427 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/sinval.h,v 1.56 2010/01/09 16:49:27 sriggs Exp $ + * $PostgreSQL: pgsql/src/include/storage/sinval.h,v 1.57 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,14 +19,16 @@ /* - * We currently support three types of shared-invalidation messages: one that - * invalidates an entry in a catcache, one that invalidates a relcache entry, - * and one that invalidates an smgr cache entry. More types could be added - * if needed. The message type is identified by the first "int16" field of - * the message struct. Zero or positive means a catcache inval message (and - * also serves as the catcache ID field). -1 means a relcache inval message. - * -2 means an smgr inval message. Other negative values are available to - * identify other inval message types. 
+ * We support several types of shared-invalidation messages: + * * invalidate a specific tuple in a specific catcache + * * invalidate all catcache entries from a given system catalog + * * invalidate a relcache entry for a specific logical relation + * * invalidate an smgr cache entry for a specific physical relation + * * invalidate the mapped-relation mapping for a given database + * More types could be added if needed. The message type is identified by + * the first "int16" field of the message struct. Zero or positive means a + * specific-catcache inval message (and also serves as the catcache ID field). + * Negative values identify the other message types, as per codes below. * * Catcache inval events are initially driven by detecting tuple inserts, * updates and deletions in system catalogs (see CacheInvalidateHeapTuple). @@ -46,6 +48,16 @@ * and so that negative cache entries can be recognized with good accuracy. * (Of course this assumes that all the backends are using identical hashing * code, but that should be OK.) + * + * Catcache and relcache invalidations are transactional, and so are sent + * to other backends upon commit. Internally to the generating backend, + * they are also processed at CommandCounterIncrement so that later commands + * in the same transaction see the new state. The generating backend also + * has to process them at abort, to flush out any cache state it's loaded + * from no-longer-valid entries. + * + * smgr and relation mapping invalidations are non-transactional: they are + * sent immediately when the underlying file change is made. 
*/ typedef struct @@ -57,7 +69,16 @@ typedef struct uint32 hashValue; /* hash value of key for this catcache */ } SharedInvalCatcacheMsg; -#define SHAREDINVALRELCACHE_ID (-1) +#define SHAREDINVALCATALOG_ID (-1) + +typedef struct +{ + int16 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 if a shared catalog */ + Oid catId; /* ID of catalog whose contents are invalid */ +} SharedInvalCatalogMsg; + +#define SHAREDINVALRELCACHE_ID (-2) typedef struct { @@ -66,7 +87,7 @@ typedef struct Oid relId; /* relation ID */ } SharedInvalRelcacheMsg; -#define SHAREDINVALSMGR_ID (-2) +#define SHAREDINVALSMGR_ID (-3) typedef struct { @@ -74,12 +95,22 @@ typedef struct RelFileNode rnode; /* physical file ID */ } SharedInvalSmgrMsg; +#define SHAREDINVALRELMAP_ID (-4) + +typedef struct +{ + int16 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 for shared catalogs */ +} SharedInvalRelmapMsg; + typedef union { int16 id; /* type field --- must be first */ SharedInvalCatcacheMsg cc; + SharedInvalCatalogMsg cat; SharedInvalRelcacheMsg rc; SharedInvalSmgrMsg sm; + SharedInvalRelmapMsg rm; } SharedInvalidationMessage; diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 6381c3d7356..a6a4284b44a 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.346 2010/02/01 03:14:45 itagaki Exp $ + * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.347 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -435,6 +435,8 @@ extern Datum pg_total_relation_size(PG_FUNCTION_ARGS); extern Datum pg_size_pretty(PG_FUNCTION_ARGS); extern Datum pg_table_size(PG_FUNCTION_ARGS); extern Datum pg_indexes_size(PG_FUNCTION_ARGS); +extern Datum 
pg_relation_filenode(PG_FUNCTION_ARGS); +extern Datum pg_relation_filepath(PG_FUNCTION_ARGS); /* genfile.c */ extern Datum pg_stat_file(PG_FUNCTION_ARGS); diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index b8e945e8ccf..6d77c4a7d1c 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -13,7 +13,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.69 2010/01/02 16:58:10 momjian Exp $ + * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.70 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -180,6 +180,7 @@ extern void ReleaseCatCacheList(CatCList *list); extern void ResetCatalogCaches(void); extern void CatalogCacheFlushRelation(Oid relId); +extern void CatalogCacheFlushCatalog(Oid catId); extern void CatalogCacheIdInvalidate(int cacheId, uint32 hashValue, ItemPointer pointer); extern void PrepareToInvalidateCacheTuple(Relation relation, diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index dc35160ffef..1a9bbe5b38a 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.47 2010/02/03 01:14:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.48 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -45,6 +45,8 @@ extern void EndNonTransactionalInvalidation(void); extern void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple); +extern void CacheInvalidateCatalog(Oid catalogId); + extern void CacheInvalidateRelcache(Relation relation); extern void 
CacheInvalidateRelcacheByTuple(HeapTuple classTuple); @@ -53,6 +55,8 @@ extern void CacheInvalidateRelcacheByRelid(Oid relid); extern void CacheInvalidateSmgr(RelFileNode rnode); +extern void CacheInvalidateRelmap(Oid databaseId); + extern void CacheRegisterSyscacheCallback(int cacheid, SyscacheCallbackFunction func, Datum arg); @@ -60,6 +64,8 @@ extern void CacheRegisterSyscacheCallback(int cacheid, extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, Datum arg); +extern void CallSyscacheCallbacks(int cacheid, ItemPointer tuplePtr); + extern void inval_twophase_postcommit(TransactionId xid, uint16 info, void *recdata, uint32 len); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 3f5795d0eae..c4a1fcf7b64 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.121 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.122 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -340,6 +340,16 @@ typedef struct StdRdOptions ((relation)->rd_rel->relnamespace) /* + * RelationIsMapped + * True if the relation uses the relfilenode map. + * + * NB: this is only meaningful for relkinds that have storage, else it + * will misleadingly say "true". + */ +#define RelationIsMapped(relation) \ + ((relation)->rd_rel->relfilenode == InvalidOid) + +/* * RelationOpenSmgr * Open the relation at the smgr level, if not already done. 
*/ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 2e48250cbf3..74d6af01bab 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.67 2010/02/03 01:14:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.68 2010/02/07 20:48:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -68,7 +68,8 @@ extern Relation RelationBuildLocalRelation(const char *relname, TupleDesc tupDesc, Oid relid, Oid reltablespace, - bool shared_relation); + bool shared_relation, + bool mapped_relation); /* * Routine to manage assignment of new relfilenode to a relation @@ -85,6 +86,8 @@ extern void RelationCacheInvalidateEntry(Oid relationId); extern void RelationCacheInvalidate(void); +extern void RelationCloseSmgrByOid(Oid relationId); + extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); diff --git a/src/include/utils/relmapper.h b/src/include/utils/relmapper.h new file mode 100644 index 00000000000..6bd1f6ba403 --- /dev/null +++ b/src/include/utils/relmapper.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * relmapper.h + * Catalog-to-filenode mapping + * + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: pgsql/src/include/utils/relmapper.h,v 1.1 2010/02/07 20:48:13 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef RELMAPPER_H +#define RELMAPPER_H + +#include "access/xlog.h" + +/* ---------------- + * relmap-related XLOG entries + * 
---------------- + */ + +#define XLOG_RELMAP_UPDATE 0x00 + +typedef struct xl_relmap_update +{ + Oid dbid; /* database ID, or 0 for shared map */ + Oid tsid; /* database's tablespace, or pg_global */ + int32 nbytes; /* size of relmap data */ + char data[1]; /* VARIABLE LENGTH ARRAY */ +} xl_relmap_update; + +#define MinSizeOfRelmapUpdate offsetof(xl_relmap_update, data) + + +extern Oid RelationMapOidToFilenode(Oid relationId, bool shared); + +extern void RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, + bool immediate); + +extern void RelationMapRemoveMapping(Oid relationId); + +extern void RelationMapInvalidate(bool shared); +extern void RelationMapInvalidateAll(void); + +extern void AtCCI_RelationMap(void); +extern void AtEOXact_RelationMap(bool isCommit); +extern void AtPrepare_RelationMap(void); + +extern void CheckPointRelationMap(void); + +extern void RelationMapFinishBootstrap(void); + +extern void RelationMapInitialize(void); +extern void RelationMapInitializePhase2(void); +extern void RelationMapInitializePhase3(void); + +extern void relmap_redo(XLogRecPtr lsn, XLogRecord *record); +extern void relmap_desc(StringInfo buf, uint8 xl_info, char *rec); + +#endif /* RELMAPPER_H */ diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index e94e1d538f8..40db5df47ec 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -108,7 +108,7 @@ SELECT relid, ORDER BY relid::text; relid | cluster | full_inplace | full -------------+---------+--------------+------ - pg_am | t | t | t + pg_am | t | t | f pg_class | t | t | t pg_database | t | t | t vaccluster | f | t | f diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index fa5f507e45c..eb53eff4b44 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -1,5 +1,5 @@ # ---------- -# $PostgreSQL: pgsql/src/test/regress/parallel_schedule,v 1.58 2010/01/28 23:21:13 
petere Exp $ +# $PostgreSQL: pgsql/src/test/regress/parallel_schedule,v 1.59 2010/02/07 20:48:13 tgl Exp $ # # By convention, we put no more than twenty tests in any one parallel group; # this limits the number of connections needed to run the tests. @@ -52,7 +52,10 @@ test: copy copyselect # ---------- # Another group of parallel tests # ---------- -test: constraints triggers create_misc create_aggregate create_operator inherit typed_table vacuum drop_if_exists create_cast +test: constraints triggers create_misc create_aggregate create_operator inherit typed_table drop_if_exists create_cast + +# XXX temporarily run this by itself +test: vacuum # Depends on the above test: create_index create_view |