diff options
author | Peter Eisentraut | 2020-04-08 07:59:27 +0000 |
---|---|---|
committer | Peter Eisentraut | 2020-04-08 09:19:23 +0000 |
commit | 83fd4532a72179c370e318075a10e0e2aa832024 (patch) | |
tree | 9c3c582fe39c51278949eb4b5cd0cbcb0ddd685a /src/backend/replication/pgoutput/pgoutput.c | |
parent | 1aac32df89eb19949050f6f27c268122833ad036 (diff) |
Allow publishing partition changes via ancestors
To control whether partition changes are replicated using their own
identity and schema or an ancestor's, add a new per-publication
parameter named 'publish_via_partition_root'.
This allows replicating a partitioned table into a different partition
structure on the subscriber.
Author: Amit Langote <[email protected]>
Reviewed-by: Rafia Sabih <[email protected]>
Reviewed-by: Peter Eisentraut <[email protected]>
Reviewed-by: Petr Jelinek <[email protected]>
Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
Diffstat (limited to 'src/backend/replication/pgoutput/pgoutput.c')
-rw-r--r-- | src/backend/replication/pgoutput/pgoutput.c | 223 |
1 files changed, 182 insertions, 41 deletions
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 552a70cffa5..5fbf2d4367b 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -12,6 +12,8 @@ */ #include "postgres.h" +#include "access/tupconvert.h" +#include "catalog/partition.h" #include "catalog/pg_publication.h" #include "fmgr.h" #include "replication/logical.h" @@ -20,6 +22,7 @@ #include "replication/pgoutput.h" #include "utils/int8.h" #include "utils/inval.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/syscache.h" #include "utils/varlena.h" @@ -49,6 +52,7 @@ static bool publications_valid; static List *LoadPublications(List *pubnames); static void publication_invalidation_cb(Datum arg, int cacheid, uint32 hashvalue); +static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx); /* * Entry in the map used to remember which relation schemas we sent. @@ -59,9 +63,31 @@ static void publication_invalidation_cb(Datum arg, int cacheid, typedef struct RelationSyncEntry { Oid relid; /* relation oid */ - bool schema_sent; /* did we send the schema? */ + + /* + * Did we send the schema? If ancestor relid is set, its schema must also + * have been sent for this to be true. + */ + bool schema_sent; + bool replicate_valid; PublicationActions pubactions; + + /* + * OID of the relation to publish changes as. For a partition, this may + * be set to one of its ancestors whose schema will be used when + * replicating changes, if publish_via_partition_root is set for the + * publication. + */ + Oid publish_as_relid; + + /* + * Map used when replicating using an ancestor's schema to convert tuples + * from partition's type to the ancestor's; NULL if publish_as_relid is + * same as 'relid' or if unnecessary due to partition and the ancestor + * having identical TupleDesc. 
+ */ + TupleConversionMap *map; } RelationSyncEntry; /* Map used to remember which relation schemas we sent. */ @@ -259,47 +285,71 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, } /* - * Write the relation schema if the current schema hasn't been sent yet. + * Write the current schema of the relation and its ancestor (if any) if not + * done yet. */ static void maybe_send_schema(LogicalDecodingContext *ctx, Relation relation, RelationSyncEntry *relentry) { - if (!relentry->schema_sent) + if (relentry->schema_sent) + return; + + /* If needed, send the ancestor's schema first. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) { - TupleDesc desc; - int i; + Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid); + TupleDesc indesc = RelationGetDescr(relation); + TupleDesc outdesc = RelationGetDescr(ancestor); + MemoryContext oldctx; + + /* Map must live as long as the session does. */ + oldctx = MemoryContextSwitchTo(CacheMemoryContext); + relentry->map = convert_tuples_by_name(indesc, outdesc); + MemoryContextSwitchTo(oldctx); + send_relation_and_attrs(ancestor, ctx); + RelationClose(ancestor); + } - desc = RelationGetDescr(relation); + send_relation_and_attrs(relation, ctx); + relentry->schema_sent = true; +} - /* - * Write out type info if needed. We do that only for user-created - * types. We use FirstGenbkiObjectId as the cutoff, so that we only - * consider objects with hand-assigned OIDs to be "built in", not for - * instance any function or type defined in the information_schema. - * This is important because only hand-assigned OIDs can be expected - * to remain stable across major versions. 
- */ - for (i = 0; i < desc->natts; i++) - { - Form_pg_attribute att = TupleDescAttr(desc, i); +/* + * Sends a relation + */ +static void +send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx) +{ + TupleDesc desc = RelationGetDescr(relation); + int i; - if (att->attisdropped || att->attgenerated) - continue; + /* + * Write out type info if needed. We do that only for user-created types. + * We use FirstGenbkiObjectId as the cutoff, so that we only consider + * objects with hand-assigned OIDs to be "built in", not for instance any + * function or type defined in the information_schema. This is important + * because only hand-assigned OIDs can be expected to remain stable across + * major versions. + */ + for (i = 0; i < desc->natts; i++) + { + Form_pg_attribute att = TupleDescAttr(desc, i); - if (att->atttypid < FirstGenbkiObjectId) - continue; + if (att->attisdropped || att->attgenerated) + continue; - OutputPluginPrepareWrite(ctx, false); - logicalrep_write_typ(ctx->out, att->atttypid); - OutputPluginWrite(ctx, false); - } + if (att->atttypid < FirstGenbkiObjectId) + continue; OutputPluginPrepareWrite(ctx, false); - logicalrep_write_rel(ctx->out, relation); + logicalrep_write_typ(ctx->out, att->atttypid); OutputPluginWrite(ctx, false); - relentry->schema_sent = true; } + + OutputPluginPrepareWrite(ctx, false); + logicalrep_write_rel(ctx->out, relation); + OutputPluginWrite(ctx, false); } /* @@ -346,28 +396,65 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, switch (change->action) { case REORDER_BUFFER_CHANGE_INSERT: - OutputPluginPrepareWrite(ctx, true); - logicalrep_write_insert(ctx->out, relation, - &change->data.tp.newtuple->tuple); - OutputPluginWrite(ctx, true); - break; + { + HeapTuple tuple = &change->data.tp.newtuple->tuple; + + /* Switch relation if publishing via root. 
*/ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + relation = RelationIdGetRelation(relentry->publish_as_relid); + /* Convert tuple if needed. */ + if (relentry->map) + tuple = execute_attr_map_tuple(tuple, relentry->map); + } + + OutputPluginPrepareWrite(ctx, true); + logicalrep_write_insert(ctx->out, relation, tuple); + OutputPluginWrite(ctx, true); + break; + } case REORDER_BUFFER_CHANGE_UPDATE: { HeapTuple oldtuple = change->data.tp.oldtuple ? &change->data.tp.oldtuple->tuple : NULL; + HeapTuple newtuple = &change->data.tp.newtuple->tuple; + + /* Switch relation if publishing via root. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + relation = RelationIdGetRelation(relentry->publish_as_relid); + /* Convert tuples if needed. */ + if (relentry->map) + { + oldtuple = execute_attr_map_tuple(oldtuple, relentry->map); + newtuple = execute_attr_map_tuple(newtuple, relentry->map); + } + } OutputPluginPrepareWrite(ctx, true); - logicalrep_write_update(ctx->out, relation, oldtuple, - &change->data.tp.newtuple->tuple); + logicalrep_write_update(ctx->out, relation, oldtuple, newtuple); OutputPluginWrite(ctx, true); break; } case REORDER_BUFFER_CHANGE_DELETE: if (change->data.tp.oldtuple) { + HeapTuple oldtuple = &change->data.tp.oldtuple->tuple; + + /* Switch relation if publishing via root. */ + if (relentry->publish_as_relid != RelationGetRelid(relation)) + { + Assert(relation->rd_rel->relispartition); + relation = RelationIdGetRelation(relentry->publish_as_relid); + /* Convert tuple if needed. 
*/ + if (relentry->map) + oldtuple = execute_attr_map_tuple(oldtuple, relentry->map); + } + OutputPluginPrepareWrite(ctx, true); - logicalrep_write_delete(ctx->out, relation, - &change->data.tp.oldtuple->tuple); + logicalrep_write_delete(ctx->out, relation, oldtuple); OutputPluginWrite(ctx, true); } else @@ -412,10 +499,11 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn, continue; /* - * Don't send partitioned tables, because partitions should be sent - * instead. + * Don't send partitions if the publication wants to send only the + * root tables through it. */ - if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + if (relation->rd_rel->relispartition && + relentry->publish_as_relid != relid) continue; relids[nrelids++] = relid; @@ -540,12 +628,15 @@ init_rel_sync_cache(MemoryContext cachectx) * This looks up publications that the given relation is directly or * indirectly part of (the latter if it's really the relation's ancestor that * is part of a publication) and fills up the found entry with the information - * about which operations to publish. + * about which operations to publish and whether to use an ancestor's schema + * when publishing. */ static RelationSyncEntry * get_rel_sync_entry(PGOutputData *data, Oid relid) { RelationSyncEntry *entry; + bool am_partition = get_rel_relispartition(relid); + char relkind = get_rel_relkind(relid); bool found; MemoryContext oldctx; @@ -564,6 +655,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid) { List *pubids = GetRelationPublications(relid); ListCell *lc; + Oid publish_as_relid = relid; /* Reload publications if needed before use. 
*/ if (!publications_valid) @@ -588,8 +680,56 @@ get_rel_sync_entry(PGOutputData *data, Oid relid) foreach(lc, data->publications) { Publication *pub = lfirst(lc); + bool publish = false; + + if (pub->alltables) + { + publish = true; + if (pub->pubviaroot && am_partition) + publish_as_relid = llast_oid(get_partition_ancestors(relid)); + } + + if (!publish) + { + bool ancestor_published = false; + + /* + * For a partition, check if any of the ancestors are + * published. If so, note down the topmost ancestor that is + * published via this publication, which will be used as the + * relation via which to publish the partition's changes. + */ + if (am_partition) + { + List *ancestors = get_partition_ancestors(relid); + ListCell *lc2; + + /* Find the "topmost" ancestor that is in this publication. */ + foreach(lc2, ancestors) + { + Oid ancestor = lfirst_oid(lc2); + + if (list_member_oid(GetRelationPublications(ancestor), + pub->oid)) + { + ancestor_published = true; + if (pub->pubviaroot) + publish_as_relid = ancestor; + } + } + } + + if (list_member_oid(pubids, pub->oid) || ancestor_published) + publish = true; + } - if (pub->alltables || list_member_oid(pubids, pub->oid)) + /* + * Don't publish changes for partitioned tables, because + * publishing those of its partitions suffices, unless partition + * changes won't be published due to pubviaroot being set. + */ + if (publish && + (relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot)) { entry->pubactions.pubinsert |= pub->pubactions.pubinsert; entry->pubactions.pubupdate |= pub->pubactions.pubupdate; @@ -604,6 +744,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid) list_free(pubids); + entry->publish_as_relid = publish_as_relid; entry->replicate_valid = true; } |