Prevent excessive delays before launching new logical replication workers.

author Tom Lane <[email protected]>

Tue, 24 Jun 2025 18:14:04 +0000 (14:14 -0400)

committer Tom Lane <[email protected]>

Tue, 24 Jun 2025 18:14:07 +0000 (14:14 -0400)
author Tom Lane <[email protected]>
Tue, 24 Jun 2025 18:14:04 +0000 (14:14 -0400)
committer Tom Lane <[email protected]>
Tue, 24 Jun 2025 18:14:07 +0000 (14:14 -0400)
diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c

index 1c3c051403dd638d257483495657f432297f37f7..14d8efbd25bf5317338a12a8c86978f7810ead9c 100644 (file)
--- a/src/backend/replication/logical/launcher.c
+++ b/src/backend/replication/logical/launcher.c
@@ -175,12 +175,14 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
                                uint16 generation,
                                BackgroundWorkerHandle *handle)
  {
-   BgwHandleStatus status;
-   int         rc;
+   bool        result = false;
+   bool        dropped_latch = false;
  
     for (;;)
     {
+       BgwHandleStatus status;
         pid_t       pid;
+       int         rc;
  
         CHECK_FOR_INTERRUPTS();
  
@@ -189,8 +191,9 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
         /* Worker either died or has started. Return false if died. */
         if (!worker->in_use || worker->proc)
         {
+           result = worker->in_use;
             LWLockRelease(LogicalRepWorkerLock);
-           return worker->in_use;
+           break;
         }
  
         LWLockRelease(LogicalRepWorkerLock);
@@ -205,7 +208,7 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
             if (generation == worker->generation)
                 logicalrep_worker_cleanup(worker);
             LWLockRelease(LogicalRepWorkerLock);
-           return false;
+           break;              /* result is already false */
         }
  
         /*
@@ -220,8 +223,18 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
         {
             ResetLatch(MyLatch);
             CHECK_FOR_INTERRUPTS();
+           dropped_latch = true;
         }
     }
+
+   /*
+    * If we had to clear a latch event in order to wait, be sure to restore
+    * it before exiting.  Otherwise caller may miss events.
+    */
+   if (dropped_latch)
+       SetLatch(MyLatch);
+
+   return result;
  }
  
  /*
@@ -1194,10 +1207,21 @@ ApplyLauncherMain(Datum main_arg)
                 (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval)
             {
                 ApplyLauncherSetWorkerStartTime(sub->oid, now);
-               logicalrep_worker_launch(WORKERTYPE_APPLY,
-                                        sub->dbid, sub->oid, sub->name,
-                                        sub->owner, InvalidOid,
-                                        DSM_HANDLE_INVALID);
+               if (!logicalrep_worker_launch(WORKERTYPE_APPLY,
+                                             sub->dbid, sub->oid, sub->name,
+                                             sub->owner, InvalidOid,
+                                             DSM_HANDLE_INVALID))
+               {
+                   /*
+                    * We get here either if we failed to launch a worker
+                    * (perhaps for resource-exhaustion reasons) or if we
+                    * launched one but it immediately quit.  Either way, it
+                    * seems appropriate to try again after
+                    * wal_retrieve_retry_interval.
+                    */
+                   wait_time = Min(wait_time,
+                                   wal_retrieve_retry_interval);
+               }
             }
             else
             {
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c

index 8e1e8762f6258fd7e3a97623d60fa75e554bbb03..c90f23ee5b0b233c29bed1fcc0554e507baa3293 100644 (file)
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -603,14 +603,19 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn)
                         TimestampDifferenceExceeds(hentry->last_start_time, now,
                                                    wal_retrieve_retry_interval))
                     {
-                       logicalrep_worker_launch(WORKERTYPE_TABLESYNC,
-                                                MyLogicalRepWorker->dbid,
-                                                MySubscription->oid,
-                                                MySubscription->name,
-                                                MyLogicalRepWorker->userid,
-                                                rstate->relid,
-                                                DSM_HANDLE_INVALID);
+                       /*
+                        * Set the last_start_time even if we fail to start
+                        * the worker, so that we won't retry until
+                        * wal_retrieve_retry_interval has elapsed.
+                        */
                         hentry->last_start_time = now;
+                       (void) logicalrep_worker_launch(WORKERTYPE_TABLESYNC,
+                                                       MyLogicalRepWorker->dbid,
+                                                       MySubscription->oid,
+                                                       MySubscription->name,
+                                                       MyLogicalRepWorker->userid,
+                                                       rstate->relid,
+                                                       DSM_HANDLE_INVALID);
                     }
                 }
             }
author	Tom Lane <[email protected]>
	Tue, 24 Jun 2025 18:14:04 +0000 (14:14 -0400)
committer	Tom Lane <[email protected]>
	Tue, 24 Jun 2025 18:14:07 +0000 (14:14 -0400)
src/backend/replication/logical/launcher.c		patch | blob | blame | history
src/backend/replication/logical/tablesync.c		patch | blob | blame | history