summaryrefslogtreecommitdiff
path: root/src/backend/replication/pgoutput/pgoutput.c
diff options
context:
space:
mode:
authorPeter Eisentraut2020-04-08 07:59:27 +0000
committerPeter Eisentraut2020-04-08 09:19:23 +0000
commit83fd4532a72179c370e318075a10e0e2aa832024 (patch)
tree9c3c582fe39c51278949eb4b5cd0cbcb0ddd685a /src/backend/replication/pgoutput/pgoutput.c
parent1aac32df89eb19949050f6f27c268122833ad036 (diff)
Allow publishing partition changes via ancestors
To control whether partition changes are replicated using their own identity and schema or an ancestor's, add a new parameter that can be set per publication named 'publish_via_partition_root'. This allows replicating a partitioned table into a different partition structure on the subscriber. Author: Amit Langote <[email protected]> Reviewed-by: Rafia Sabih <[email protected]> Reviewed-by: Peter Eisentraut <[email protected]> Reviewed-by: Petr Jelinek <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
Diffstat (limited to 'src/backend/replication/pgoutput/pgoutput.c')
-rw-r--r--src/backend/replication/pgoutput/pgoutput.c223
1 files changed, 182 insertions, 41 deletions
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index 552a70cffa5..5fbf2d4367b 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -12,6 +12,8 @@
*/
#include "postgres.h"
+#include "access/tupconvert.h"
+#include "catalog/partition.h"
#include "catalog/pg_publication.h"
#include "fmgr.h"
#include "replication/logical.h"
@@ -20,6 +22,7 @@
#include "replication/pgoutput.h"
#include "utils/int8.h"
#include "utils/inval.h"
+#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/syscache.h"
#include "utils/varlena.h"
@@ -49,6 +52,7 @@ static bool publications_valid;
static List *LoadPublications(List *pubnames);
static void publication_invalidation_cb(Datum arg, int cacheid,
uint32 hashvalue);
+static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx);
/*
* Entry in the map used to remember which relation schemas we sent.
@@ -59,9 +63,31 @@ static void publication_invalidation_cb(Datum arg, int cacheid,
typedef struct RelationSyncEntry
{
Oid relid; /* relation oid */
- bool schema_sent; /* did we send the schema? */
+
+ /*
+ * Did we send the schema? If ancestor relid is set, its schema must also
+ * have been sent for this to be true.
+ */
+ bool schema_sent;
+
bool replicate_valid;
PublicationActions pubactions;
+
+ /*
+ * OID of the relation to publish changes as. For a partition, this may
+ * be set to one of its ancestors whose schema will be used when
+ * replicating changes, if publish_via_partition_root is set for the
+ * publication.
+ */
+ Oid publish_as_relid;
+
+ /*
+ * Map used when replicating using an ancestor's schema to convert tuples
+ * from partition's type to the ancestor's; NULL if publish_as_relid is
+ * same as 'relid' or if unnecessary due to partition and the ancestor
+ * having identical TupleDesc.
+ */
+ TupleConversionMap *map;
} RelationSyncEntry;
/* Map used to remember which relation schemas we sent. */
@@ -259,47 +285,71 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
}
/*
- * Write the relation schema if the current schema hasn't been sent yet.
+ * Write the current schema of the relation and its ancestor (if any) if not
+ * done yet.
*/
static void
maybe_send_schema(LogicalDecodingContext *ctx,
Relation relation, RelationSyncEntry *relentry)
{
- if (!relentry->schema_sent)
+ if (relentry->schema_sent)
+ return;
+
+ /* If needed, send the ancestor's schema first. */
+ if (relentry->publish_as_relid != RelationGetRelid(relation))
{
- TupleDesc desc;
- int i;
+ Relation ancestor = RelationIdGetRelation(relentry->publish_as_relid);
+ TupleDesc indesc = RelationGetDescr(relation);
+ TupleDesc outdesc = RelationGetDescr(ancestor);
+ MemoryContext oldctx;
+
+ /* Map must live as long as the session does. */
+ oldctx = MemoryContextSwitchTo(CacheMemoryContext);
+ relentry->map = convert_tuples_by_name(indesc, outdesc);
+ MemoryContextSwitchTo(oldctx);
+ send_relation_and_attrs(ancestor, ctx);
+ RelationClose(ancestor);
+ }
- desc = RelationGetDescr(relation);
+ send_relation_and_attrs(relation, ctx);
+ relentry->schema_sent = true;
+}
- /*
- * Write out type info if needed. We do that only for user-created
- * types. We use FirstGenbkiObjectId as the cutoff, so that we only
- * consider objects with hand-assigned OIDs to be "built in", not for
- * instance any function or type defined in the information_schema.
- * This is important because only hand-assigned OIDs can be expected
- * to remain stable across major versions.
- */
- for (i = 0; i < desc->natts; i++)
- {
- Form_pg_attribute att = TupleDescAttr(desc, i);
+/*
+ * Sends a relation
+ */
+static void
+send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx)
+{
+ TupleDesc desc = RelationGetDescr(relation);
+ int i;
- if (att->attisdropped || att->attgenerated)
- continue;
+ /*
+ * Write out type info if needed. We do that only for user-created types.
+ * We use FirstGenbkiObjectId as the cutoff, so that we only consider
+ * objects with hand-assigned OIDs to be "built in", not for instance any
+ * function or type defined in the information_schema. This is important
+ * because only hand-assigned OIDs can be expected to remain stable across
+ * major versions.
+ */
+ for (i = 0; i < desc->natts; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(desc, i);
- if (att->atttypid < FirstGenbkiObjectId)
- continue;
+ if (att->attisdropped || att->attgenerated)
+ continue;
- OutputPluginPrepareWrite(ctx, false);
- logicalrep_write_typ(ctx->out, att->atttypid);
- OutputPluginWrite(ctx, false);
- }
+ if (att->atttypid < FirstGenbkiObjectId)
+ continue;
OutputPluginPrepareWrite(ctx, false);
- logicalrep_write_rel(ctx->out, relation);
+ logicalrep_write_typ(ctx->out, att->atttypid);
OutputPluginWrite(ctx, false);
- relentry->schema_sent = true;
}
+
+ OutputPluginPrepareWrite(ctx, false);
+ logicalrep_write_rel(ctx->out, relation);
+ OutputPluginWrite(ctx, false);
}
/*
@@ -346,28 +396,65 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
switch (change->action)
{
case REORDER_BUFFER_CHANGE_INSERT:
- OutputPluginPrepareWrite(ctx, true);
- logicalrep_write_insert(ctx->out, relation,
- &change->data.tp.newtuple->tuple);
- OutputPluginWrite(ctx, true);
- break;
+ {
+ HeapTuple tuple = &change->data.tp.newtuple->tuple;
+
+ /* Switch relation if publishing via root. */
+ if (relentry->publish_as_relid != RelationGetRelid(relation))
+ {
+ Assert(relation->rd_rel->relispartition);
+ relation = RelationIdGetRelation(relentry->publish_as_relid);
+ /* Convert tuple if needed. */
+ if (relentry->map)
+ tuple = execute_attr_map_tuple(tuple, relentry->map);
+ }
+
+ OutputPluginPrepareWrite(ctx, true);
+ logicalrep_write_insert(ctx->out, relation, tuple);
+ OutputPluginWrite(ctx, true);
+ break;
+ }
case REORDER_BUFFER_CHANGE_UPDATE:
{
HeapTuple oldtuple = change->data.tp.oldtuple ?
&change->data.tp.oldtuple->tuple : NULL;
+ HeapTuple newtuple = &change->data.tp.newtuple->tuple;
+
+ /* Switch relation if publishing via root. */
+ if (relentry->publish_as_relid != RelationGetRelid(relation))
+ {
+ Assert(relation->rd_rel->relispartition);
+ relation = RelationIdGetRelation(relentry->publish_as_relid);
+ /* Convert tuples if needed. */
+ if (relentry->map)
+ {
+ oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
+ newtuple = execute_attr_map_tuple(newtuple, relentry->map);
+ }
+ }
OutputPluginPrepareWrite(ctx, true);
- logicalrep_write_update(ctx->out, relation, oldtuple,
- &change->data.tp.newtuple->tuple);
+ logicalrep_write_update(ctx->out, relation, oldtuple, newtuple);
OutputPluginWrite(ctx, true);
break;
}
case REORDER_BUFFER_CHANGE_DELETE:
if (change->data.tp.oldtuple)
{
+ HeapTuple oldtuple = &change->data.tp.oldtuple->tuple;
+
+ /* Switch relation if publishing via root. */
+ if (relentry->publish_as_relid != RelationGetRelid(relation))
+ {
+ Assert(relation->rd_rel->relispartition);
+ relation = RelationIdGetRelation(relentry->publish_as_relid);
+ /* Convert tuple if needed. */
+ if (relentry->map)
+ oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
+ }
+
OutputPluginPrepareWrite(ctx, true);
- logicalrep_write_delete(ctx->out, relation,
- &change->data.tp.oldtuple->tuple);
+ logicalrep_write_delete(ctx->out, relation, oldtuple);
OutputPluginWrite(ctx, true);
}
else
@@ -412,10 +499,11 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
continue;
/*
- * Don't send partitioned tables, because partitions should be sent
- * instead.
+ * Don't send partitions if the publication wants to send only the
+ * root tables through it.
*/
- if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ if (relation->rd_rel->relispartition &&
+ relentry->publish_as_relid != relid)
continue;
relids[nrelids++] = relid;
@@ -540,12 +628,15 @@ init_rel_sync_cache(MemoryContext cachectx)
* This looks up publications that the given relation is directly or
* indirectly part of (the latter if it's really the relation's ancestor that
* is part of a publication) and fills up the found entry with the information
- * about which operations to publish.
+ * about which operations to publish and whether to use an ancestor's schema
+ * when publishing.
*/
static RelationSyncEntry *
get_rel_sync_entry(PGOutputData *data, Oid relid)
{
RelationSyncEntry *entry;
+ bool am_partition = get_rel_relispartition(relid);
+ char relkind = get_rel_relkind(relid);
bool found;
MemoryContext oldctx;
@@ -564,6 +655,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
{
List *pubids = GetRelationPublications(relid);
ListCell *lc;
+ Oid publish_as_relid = relid;
/* Reload publications if needed before use. */
if (!publications_valid)
@@ -588,8 +680,56 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
foreach(lc, data->publications)
{
Publication *pub = lfirst(lc);
+ bool publish = false;
+
+ if (pub->alltables)
+ {
+ publish = true;
+ if (pub->pubviaroot && am_partition)
+ publish_as_relid = llast_oid(get_partition_ancestors(relid));
+ }
+
+ if (!publish)
+ {
+ bool ancestor_published = false;
+
+ /*
+ * For a partition, check if any of the ancestors are
+ * published. If so, note down the topmost ancestor that is
+ * published via this publication, which will be used as the
+ * relation via which to publish the partition's changes.
+ */
+ if (am_partition)
+ {
+ List *ancestors = get_partition_ancestors(relid);
+ ListCell *lc2;
+
+ /* Find the "topmost" ancestor that is in this publication. */
+ foreach(lc2, ancestors)
+ {
+ Oid ancestor = lfirst_oid(lc2);
+
+ if (list_member_oid(GetRelationPublications(ancestor),
+ pub->oid))
+ {
+ ancestor_published = true;
+ if (pub->pubviaroot)
+ publish_as_relid = ancestor;
+ }
+ }
+ }
+
+ if (list_member_oid(pubids, pub->oid) || ancestor_published)
+ publish = true;
+ }
- if (pub->alltables || list_member_oid(pubids, pub->oid))
+ /*
+ * Don't publish changes for partitioned tables, because
+ * publishing those of its partitions suffices, unless partition
+ * changes won't be published due to pubviaroot being set.
+ */
+ if (publish &&
+ (relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot))
{
entry->pubactions.pubinsert |= pub->pubactions.pubinsert;
entry->pubactions.pubupdate |= pub->pubactions.pubupdate;
@@ -604,6 +744,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
list_free(pubids);
+ entry->publish_as_relid = publish_as_relid;
entry->replicate_valid = true;
}