Commit a58213a

danolivo authored and kelvich committed
[PGPRO-3146] The logic for restarting the receivers and the resolver has been rewritten.
The postmaster no longer restarts the resolver and receivers when they exit. Instead, on each cycle of MtmMonitor we check the state of these processes and re-launch any that have stopped.
1 parent 5ae8081 commit a58213a
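
In short, the receiver and resolver workers are now registered with bgw_restart_time = BGW_NEVER_RESTART, and MtmMonitor polls each worker handle once per loop iteration with GetBackgroundWorkerPid(), re-launching any handle that reports BGWH_STOPPED. The sketch below only illustrates that polling pattern with the standard PostgreSQL background-worker API; launch_worker() and relaunch_if_stopped() are hypothetical helpers, not functions from the multimaster sources (the actual code is in the src/state.c hunk further down).

/*
 * Illustrative sketch only: supervising workers that are registered with
 * bgw_restart_time = BGW_NEVER_RESTART.  The helper names are hypothetical;
 * the real logic lives in MtmMonitor() in src/state.c.
 */
#include "postgres.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"

static BackgroundWorkerHandle *
launch_worker(const char *name, pid_t notify_pid)
{
    BackgroundWorker        worker;
    BackgroundWorkerHandle *handle;

    MemSet(&worker, 0, sizeof(worker));
    worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
    worker.bgw_start_time = BgWorkerStart_ConsistentState;
    worker.bgw_restart_time = BGW_NEVER_RESTART;    /* postmaster never restarts it */
    worker.bgw_notify_pid = notify_pid;             /* monitor is signalled on state changes */
    snprintf(worker.bgw_name, BGW_MAXLEN, "%s", name);
    snprintf(worker.bgw_library_name, BGW_MAXLEN, "multimaster");
    snprintf(worker.bgw_function_name, BGW_MAXLEN, "%s", name);

    if (!RegisterDynamicBackgroundWorker(&worker, &handle))
        elog(ERROR, "could not register background worker \"%s\"", name);
    return handle;
}

/* One supervision pass inside the monitor's main loop. */
static BackgroundWorkerHandle *
relaunch_if_stopped(BackgroundWorkerHandle *handle, const char *name)
{
    pid_t   pid;

    if (GetBackgroundWorkerPid(handle, &pid) == BGWH_STOPPED)
    {
        elog(LOG, "worker \"%s\" stopped, re-launching", name);
        handle = launch_worker(name, MyProcPid);
    }
    return handle;
}

In the actual change, MtmMonitor keeps the handles returned at registration time and performs this check once per loop iteration, alongside its other periodic work.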

File tree: 4 files changed (+66, -5 lines)

src/pglogical_receiver.c

Lines changed: 3 additions & 2 deletions

@@ -547,6 +547,7 @@ pglogical_receiver_main(Datum main_arg)
     snprintf(worker_proc, BGW_MAXLEN, "mtm-logrep-receiver-%d-%d",
              receiver_mtm_cfg->my_node_id, nodeId);
     BgwPoolStart(&Mtm->pools[nodeId-1], worker_proc, db_id, user_id);
+    mtm_log(MtmReceiverStart, "Receiver %s has started.", worker_proc);
 
     /*
      * This is the main loop of logical replication.
@@ -968,7 +969,7 @@ pglogical_receiver_main(Datum main_arg)
      */
     BgwPoolCancel(&Mtm->pools[nodeId - 1]);
     MtmSleep(RECEIVER_SUSPEND_TIMEOUT);
-
+    mtm_log(MtmApplyError, "Receiver %s catch an error and will die", worker_proc);
     /* and die */
     PG_RE_THROW();
 }
@@ -989,7 +990,7 @@ MtmStartReceiver(int nodeId, Oid db_id, Oid user_id, pid_t monitor_pid)
     MemSet(&worker, 0, sizeof(BackgroundWorker));
     worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
     worker.bgw_start_time = BgWorkerStart_ConsistentState;
-    worker.bgw_restart_time = 1;
+    worker.bgw_restart_time = BGW_NEVER_RESTART;
     worker.bgw_main_arg = Int32GetDatum(nodeId);
     worker.bgw_notify_pid = monitor_pid;
 

src/resolver.c

Lines changed: 2 additions & 1 deletion

@@ -135,7 +135,7 @@ ResolverStart(Oid db_id, Oid user_id)
     MemSet(&worker, 0, sizeof(BackgroundWorker));
     worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
     worker.bgw_start_time = BgWorkerStart_ConsistentState;
-    worker.bgw_restart_time = 1;
+    worker.bgw_restart_time = BGW_NEVER_RESTART;
 
     memcpy(worker.bgw_extra, &db_id, sizeof(Oid));
     memcpy(worker.bgw_extra + sizeof(Oid), &user_id, sizeof(Oid));
@@ -561,6 +561,7 @@ ResolverMain(Datum main_arg)
     LWLockAcquire(resolver_state->lock, LW_EXCLUSIVE);
     resolver_state->pid = MyProcPid;
     LWLockRelease(resolver_state->lock);
+    mtm_log(ResolverTraceTxMsg, "Resolver started");
 
     for(;;)
     {

src/state.c

Lines changed: 27 additions & 1 deletion

@@ -1193,7 +1193,7 @@ MtmMonitorStart(Oid db_id, Oid user_id)
     MemSet(&worker, 0, sizeof(BackgroundWorker));
     worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION;
     worker.bgw_start_time = BgWorkerStart_ConsistentState;
-    worker.bgw_restart_time = BGW_NEVER_RESTART; /* or we can start several receivers */
+    worker.bgw_restart_time = 1;
     worker.bgw_main_arg = Int32GetDatum(0);
 
     memcpy(worker.bgw_extra, &db_id, sizeof(Oid));
@@ -1658,10 +1658,13 @@ MtmMonitor(Datum arg)
     /* Launch resolver */
     Assert(resolver == NULL);
     resolver = ResolverStart(db_id, user_id);
+    mtm_log(MtmStateMessage, "MtmMonitor started");
 
     for (;;)
     {
         int rc;
+        int i;
+        pid_t pid;
 
         CHECK_FOR_INTERRUPTS();
 
@@ -1712,6 +1715,29 @@
             config_valid = true;
         }
 
+        /*
+         * Check and restart resolver and receivers if its stopped by any error.
+         */
+        if (GetBackgroundWorkerPid(resolver, &pid) == BGWH_STOPPED)
+        {
+            mtm_log(MtmStateMessage, "Restart resolver");
+            resolver = ResolverStart(db_id, user_id);
+        }
+
+        for (i = 0; i < MTM_MAX_NODES; i++)
+        {
+            if (receivers[i] == NULL)
+                continue;
+
+            if (GetBackgroundWorkerPid(receivers[i], &pid) == BGWH_STOPPED)
+            {
+                mtm_log(MtmStateMessage, "Restart receiver for the node%d", i + 1);
+                /* Receiver has finished by some kind of mistake. Start it. */
+                receivers[i] = MtmStartReceiver(i+1, MyDatabaseId,
+                                                GetUserId(), MyProcPid);
+            }
+        }
+
         // XXX: add tx start/stop to clear mcxt?
         check_status_requests(mtm_cfg);
 

t/006_bugfixes.pl renamed to t/007_bugfixes.pl

Lines changed: 34 additions & 1 deletion

@@ -3,7 +3,7 @@
 use PostgresNode;
 use Cluster;
 use TestLib;
-use Test::More tests => 2;
+use Test::More tests => 3;
 
 my $cluster = new Cluster(3);
 $cluster->init();
@@ -90,5 +90,38 @@
 is( (($hash0 eq $hash1) and ($hash1 eq $hash2)) , 1,
     "Check that hash is the same after query");
 
+# ##############################################################################
+#
+# Check the PGPRO-3146 bug. Hard crash of backend causes restart of all postgres
+# processes. Multimaster node must be survived after the crash and included into
+# the multimaster after recovery.
+#
+# ##############################################################################
+
+# Set GUC restart_after_crash in 'on' value
+$cluster->stop();
+foreach (0..$#{$cluster->{nodes}})
+{
+    $cluster->{nodes}->[$_]->append_conf('postgresql.conf', q{restart_after_crash = on});
+}
+$cluster->start();
+$cluster->await_nodes( (0,1,2) );
+
+# Simulate payload
+$cluster->pgbench(0, ('-i', '-n', -s => '1') );
+my $pgb1 = $cluster->pgbench_async(0, ('-n', -T => '15', -j=>'5', -c => '5') );
+sleep(5);
+
+my $pid0 = $cluster->safe_psql(0, "SELECT pid FROM pg_stat_activity
+    WHERE backend_type LIKE 'client backend'
+    AND query LIKE 'UPDATE%' LIMIT 1;");
+
+# Simulate hard crash
+note("Simulate hard crash of a backend by SIGKILL to $pid0");
+kill -9, $pid0;
+
+$cluster->await_nodes( (0,1,2) );
+is($cluster->is_data_identic( (0,1,2) ), 1, "check consistency after crash");
+
 $cluster->stop();
 
