Allow publishing partition changes via ancestors

To control whether partition changes are replicated using their own identity and schema or an ancestor's, add a new parameter that can be set per publication named 'publish_via_partition_root'. This allows replicating a partitioned table into a different partition structure on the subscriber. Author: Amit Langote <[email protected]> Reviewed-by: Rafia Sabih <[email protected]> Reviewed-by: Peter Eisentraut <[email protected]> Reviewed-by: Petr Jelinek <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/CA+HiwqH=Y85vRK3mOdjEkqFK+E=ST=eQiHdpj43L=_eJMOOznQ@mail.gmail.com
author: Peter Eisentraut 2020-04-08 07:59:27 +0000
committer: Peter Eisentraut 2020-04-08 09:19:23 +0000
commit: 83fd4532a72179c370e318075a10e0e2aa832024 (patch)
tree: 9c3c582fe39c51278949eb4b5cd0cbcb0ddd685a /src/backend/replication/pgoutput/pgoutput.c
parent: 1aac32df89eb19949050f6f27c268122833ad036 (diff)
1 files changed, 182 insertions, 41 deletions
diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c
index 552a70cffa5..5fbf2d4367b 100644
--- a/src/backend/replication/pgoutput/pgoutput.c
+++ b/src/backend/replication/pgoutput/pgoutput.c
@@ -12,6 +12,8 @@
  */
 #include "postgres.h"
 
+#include "access/tupconvert.h"
+#include "catalog/partition.h"
 #include "catalog/pg_publication.h"
 #include "fmgr.h"
 #include "replication/logical.h"
@@ -20,6 +22,7 @@
 #include "replication/pgoutput.h"
 #include "utils/int8.h"
 #include "utils/inval.h"
+#include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
 #include "utils/varlena.h"
@@ -49,6 +52,7 @@ static bool publications_valid;
 static List *LoadPublications(List *pubnames);
 static void publication_invalidation_cb(Datum arg, int cacheid,
 										uint32 hashvalue);
+static void send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx);
 
 /*
  * Entry in the map used to remember which relation schemas we sent.
@@ -59,9 +63,31 @@ static void publication_invalidation_cb(Datum arg, int cacheid,
 typedef struct RelationSyncEntry
 {
 	Oid			relid;			/* relation oid */
-	bool		schema_sent;	/* did we send the schema? */
+
+	/*
+	 * Did we send the schema?  If ancestor relid is set, its schema must also
+	 * have been sent for this to be true.
+	 */
+	bool		schema_sent;
+
 	bool		replicate_valid;
 	PublicationActions pubactions;
+
+	/*
+	 * OID of the relation to publish changes as.  For a partition, this may
+	 * be set to one of its ancestors whose schema will be used when
+	 * replicating changes, if publish_via_partition_root is set for the
+	 * publication.
+	 */
+	Oid			publish_as_relid;
+
+	/*
+	 * Map used when replicating using an ancestor's schema to convert tuples
+	 * from partition's type to the ancestor's; NULL if publish_as_relid is
+	 * same as 'relid' or if unnecessary due to partition and the ancestor
+	 * having identical TupleDesc.
+	 */
+	TupleConversionMap *map;
 } RelationSyncEntry;
 
 /* Map used to remember which relation schemas we sent. */
@@ -259,47 +285,71 @@ pgoutput_commit_txn(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 }
 
 /*
- * Write the relation schema if the current schema hasn't been sent yet.
+ * Write the current schema of the relation and its ancestor (if any) if not
+ * done yet.
  */
 static void
 maybe_send_schema(LogicalDecodingContext *ctx,
 				  Relation relation, RelationSyncEntry *relentry)
 {
-	if (!relentry->schema_sent)
+	if (relentry->schema_sent)
+		return;
+
+	/* If needed, send the ancestor's schema first. */
+	if (relentry->publish_as_relid != RelationGetRelid(relation))
 	{
-		TupleDesc	desc;
-		int			i;
+		Relation	ancestor = RelationIdGetRelation(relentry->publish_as_relid);
+		TupleDesc	indesc = RelationGetDescr(relation);
+		TupleDesc	outdesc = RelationGetDescr(ancestor);
+		MemoryContext oldctx;
+
+		/* Map must live as long as the session does. */
+		oldctx = MemoryContextSwitchTo(CacheMemoryContext);
+		relentry->map = convert_tuples_by_name(indesc, outdesc);
+		MemoryContextSwitchTo(oldctx);
+		send_relation_and_attrs(ancestor, ctx);
+		RelationClose(ancestor);
+	}
 
-		desc = RelationGetDescr(relation);
+	send_relation_and_attrs(relation, ctx);
+	relentry->schema_sent = true;
+}
 
-		/*
-		 * Write out type info if needed.  We do that only for user-created
-		 * types.  We use FirstGenbkiObjectId as the cutoff, so that we only
-		 * consider objects with hand-assigned OIDs to be "built in", not for
-		 * instance any function or type defined in the information_schema.
-		 * This is important because only hand-assigned OIDs can be expected
-		 * to remain stable across major versions.
-		 */
-		for (i = 0; i < desc->natts; i++)
-		{
-			Form_pg_attribute att = TupleDescAttr(desc, i);
+/*
+ * Sends a relation
+ */
+static void
+send_relation_and_attrs(Relation relation, LogicalDecodingContext *ctx)
+{
+	TupleDesc	desc = RelationGetDescr(relation);
+	int			i;
 
-			if (att->attisdropped || att->attgenerated)
-				continue;
+	/*
+	 * Write out type info if needed.  We do that only for user-created types.
+	 * We use FirstGenbkiObjectId as the cutoff, so that we only consider
+	 * objects with hand-assigned OIDs to be "built in", not for instance any
+	 * function or type defined in the information_schema. This is important
+	 * because only hand-assigned OIDs can be expected to remain stable across
+	 * major versions.
+	 */
+	for (i = 0; i < desc->natts; i++)
+	{
+		Form_pg_attribute att = TupleDescAttr(desc, i);
 
-			if (att->atttypid < FirstGenbkiObjectId)
-				continue;
+		if (att->attisdropped || att->attgenerated)
+			continue;
 
-			OutputPluginPrepareWrite(ctx, false);
-			logicalrep_write_typ(ctx->out, att->atttypid);
-			OutputPluginWrite(ctx, false);
-		}
+		if (att->atttypid < FirstGenbkiObjectId)
+			continue;
 
 		OutputPluginPrepareWrite(ctx, false);
-		logicalrep_write_rel(ctx->out, relation);
+		logicalrep_write_typ(ctx->out, att->atttypid);
 		OutputPluginWrite(ctx, false);
-		relentry->schema_sent = true;
 	}
+
+	OutputPluginPrepareWrite(ctx, false);
+	logicalrep_write_rel(ctx->out, relation);
+	OutputPluginWrite(ctx, false);
 }
 
 /*
@@ -346,28 +396,65 @@ pgoutput_change(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 	switch (change->action)
 	{
 		case REORDER_BUFFER_CHANGE_INSERT:
-			OutputPluginPrepareWrite(ctx, true);
-			logicalrep_write_insert(ctx->out, relation,
-									&change->data.tp.newtuple->tuple);
-			OutputPluginWrite(ctx, true);
-			break;
+			{
+				HeapTuple	tuple = &change->data.tp.newtuple->tuple;
+
+				/* Switch relation if publishing via root. */
+				if (relentry->publish_as_relid != RelationGetRelid(relation))
+				{
+					Assert(relation->rd_rel->relispartition);
+					relation = RelationIdGetRelation(relentry->publish_as_relid);
+					/* Convert tuple if needed. */
+					if (relentry->map)
+						tuple = execute_attr_map_tuple(tuple, relentry->map);
+				}
+
+				OutputPluginPrepareWrite(ctx, true);
+				logicalrep_write_insert(ctx->out, relation, tuple);
+				OutputPluginWrite(ctx, true);
+				break;
+			}
 		case REORDER_BUFFER_CHANGE_UPDATE:
 			{
 				HeapTuple	oldtuple = change->data.tp.oldtuple ?
 				&change->data.tp.oldtuple->tuple : NULL;
+				HeapTuple	newtuple = &change->data.tp.newtuple->tuple;
+
+				/* Switch relation if publishing via root. */
+				if (relentry->publish_as_relid != RelationGetRelid(relation))
+				{
+					Assert(relation->rd_rel->relispartition);
+					relation = RelationIdGetRelation(relentry->publish_as_relid);
+					/* Convert tuples if needed. */
+					if (relentry->map)
+					{
+						oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
+						newtuple = execute_attr_map_tuple(newtuple, relentry->map);
+					}
+				}
 
 				OutputPluginPrepareWrite(ctx, true);
-				logicalrep_write_update(ctx->out, relation, oldtuple,
-										&change->data.tp.newtuple->tuple);
+				logicalrep_write_update(ctx->out, relation, oldtuple, newtuple);
 				OutputPluginWrite(ctx, true);
 				break;
 			}
 		case REORDER_BUFFER_CHANGE_DELETE:
 			if (change->data.tp.oldtuple)
 			{
+				HeapTuple	oldtuple = &change->data.tp.oldtuple->tuple;
+
+				/* Switch relation if publishing via root. */
+				if (relentry->publish_as_relid != RelationGetRelid(relation))
+				{
+					Assert(relation->rd_rel->relispartition);
+					relation = RelationIdGetRelation(relentry->publish_as_relid);
+					/* Convert tuple if needed. */
+					if (relentry->map)
+						oldtuple = execute_attr_map_tuple(oldtuple, relentry->map);
+				}
+
 				OutputPluginPrepareWrite(ctx, true);
-				logicalrep_write_delete(ctx->out, relation,
-										&change->data.tp.oldtuple->tuple);
+				logicalrep_write_delete(ctx->out, relation, oldtuple);
 				OutputPluginWrite(ctx, true);
 			}
 			else
@@ -412,10 +499,11 @@ pgoutput_truncate(LogicalDecodingContext *ctx, ReorderBufferTXN *txn,
 			continue;
 
 		/*
-		 * Don't send partitioned tables, because partitions should be sent
-		 * instead.
+		 * Don't send partitions if the publication wants to send only the
+		 * root tables through it.
 		 */
-		if (relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+		if (relation->rd_rel->relispartition &&
+			relentry->publish_as_relid != relid)
 			continue;
 
 		relids[nrelids++] = relid;
@@ -540,12 +628,15 @@ init_rel_sync_cache(MemoryContext cachectx)
  * This looks up publications that the given relation is directly or
  * indirectly part of (the latter if it's really the relation's ancestor that
  * is part of a publication) and fills up the found entry with the information
- * about which operations to publish.
+ * about which operations to publish and whether to use an ancestor's schema
+ * when publishing.
  */
 static RelationSyncEntry *
 get_rel_sync_entry(PGOutputData *data, Oid relid)
 {
 	RelationSyncEntry *entry;
+	bool		am_partition = get_rel_relispartition(relid);
+	char		relkind = get_rel_relkind(relid);
 	bool		found;
 	MemoryContext oldctx;
 
@@ -564,6 +655,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
 	{
 		List	   *pubids = GetRelationPublications(relid);
 		ListCell   *lc;
+		Oid			publish_as_relid = relid;
 
 		/* Reload publications if needed before use. */
 		if (!publications_valid)
@@ -588,8 +680,56 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
 		foreach(lc, data->publications)
 		{
 			Publication *pub = lfirst(lc);
+			bool		publish = false;
+
+			if (pub->alltables)
+			{
+				publish = true;
+				if (pub->pubviaroot && am_partition)
+					publish_as_relid = llast_oid(get_partition_ancestors(relid));
+			}
+
+			if (!publish)
+			{
+				bool	ancestor_published = false;
+
+				/*
+				 * For a partition, check if any of the ancestors are
+				 * published.  If so, note down the topmost ancestor that is
+				 * published via this publication, which will be used as the
+				 * relation via which to publish the partition's changes.
+				 */
+				if (am_partition)
+				{
+					List   *ancestors = get_partition_ancestors(relid);
+					ListCell *lc2;
+
+					/* Find the "topmost" ancestor that is in this publication. */
+					foreach(lc2, ancestors)
+					{
+						Oid		ancestor = lfirst_oid(lc2);
+
+						if (list_member_oid(GetRelationPublications(ancestor),
+											pub->oid))
+						{
+							ancestor_published = true;
+							if (pub->pubviaroot)
+								publish_as_relid = ancestor;
+						}
+					}
+				}
+
+				if (list_member_oid(pubids, pub->oid) || ancestor_published)
+					publish = true;
+			}
 
-			if (pub->alltables || list_member_oid(pubids, pub->oid))
+			/*
+			 * Don't publish changes for partitioned tables, because
+			 * publishing those of its partitions suffices, unless partition
+			 * changes won't be published due to pubviaroot being set.
+			 */
+			if (publish &&
+				(relkind != RELKIND_PARTITIONED_TABLE || pub->pubviaroot))
 			{
 				entry->pubactions.pubinsert |= pub->pubactions.pubinsert;
 				entry->pubactions.pubupdate |= pub->pubactions.pubupdate;
@@ -604,6 +744,7 @@ get_rel_sync_entry(PGOutputData *data, Oid relid)
 
 		list_free(pubids);
 
+		entry->publish_as_relid = publish_as_relid;
 		entry->replicate_valid = true;
 	}
author	Peter Eisentraut	2020-04-08 07:59:27 +0000
committer	Peter Eisentraut	2020-04-08 09:19:23 +0000
commit	83fd4532a72179c370e318075a10e0e2aa832024 (patch)
tree	9c3c582fe39c51278949eb4b5cd0cbcb0ddd685a /src/backend/replication/pgoutput/pgoutput.c
parent	1aac32df89eb19949050f6f27c268122833ad036 (diff)