1 files changed, 935 insertions, 0 deletions
diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c
new file mode 100644
index 00000000000..f5c03b54b48
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_relation.c
@@ -0,0 +1,935 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_relation.c
+ *	  Implementation of relation statistics.
+ *
+ * This file contains the implementation of relation statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/activity/pgstat_relation.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "postmaster/autovacuum.h"
+#include "utils/memutils.h"
+#include "utils/pgstat_internal.h"
+#include "utils/rel.h"
+#include "utils/timestamp.h"
+
+
+/*
+ * Structures in which backends store per-table info that's waiting to be
+ * sent to the collector.
+ *
+ * NOTE: once allocated, TabStatusArray structures are never moved or deleted
+ * for the life of the backend.  Also, we zero out the t_id fields of the
+ * contained PgStat_TableStatus structs whenever they are not actively in use.
+ * This allows relcache pgstat_info pointers to be treated as long-lived data,
+ * avoiding repeated searches in pgstat_relation_init() when a relation is
+ * repeatedly opened during a transaction.
+ */
+#define TABSTAT_QUANTUM		100 /* entries per TabStatusArray chunk */
+
+
+typedef struct TabStatusArray
+{
+	struct TabStatusArray *tsa_next;	/* link to next array, if any */
+	int			tsa_used;		/* # entries handed out from tsa_entries */
+	PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM];	/* per-table data */
+} TabStatusArray;
+
+static TabStatusArray *pgStatTabList = NULL;	/* first array in the chain */
+
+/*
+ * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer
+ */
+typedef struct TabStatHashEntry
+{
+	Oid			t_id;			/* hash key: relation OID */
+	PgStat_TableStatus *tsa_entry;	/* its slot, or NULL if none assigned yet */
+} TabStatHashEntry;
+
+/* Per-table record that AtPrepare_PgStat_Relations() adds to 2PC state */
+typedef struct TwoPhasePgStatRecord
+{
+	PgStat_Counter tuples_inserted; /* tuples inserted in xact */
+	PgStat_Counter tuples_updated;	/* tuples updated in xact */
+	PgStat_Counter tuples_deleted;	/* tuples deleted in xact */
+	/* tuples i/u/d prior to truncate/drop; used if the xact is rolled back */
+	PgStat_Counter inserted_pre_truncdrop;
+	PgStat_Counter updated_pre_truncdrop;
+	PgStat_Counter deleted_pre_truncdrop;
+	Oid			t_id;			/* table's OID */
+	bool		t_shared;		/* is it a shared catalog? */
+	bool		t_truncdropped; /* was the relation truncated/dropped? */
+} TwoPhasePgStatRecord;
+
+
+static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared);
+static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now);
+static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level);
+static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info);
+static void pgstat_truncdrop_save_counters(PgStat_TableXactStatus *trans, bool is_drop);
+static void pgstat_truncdrop_restore_counters(PgStat_TableXactStatus *trans);
+
+
+/*
+ * True if this backend has relation stats not yet sent to the collector;
+ * set by get_tabstat_entry(), cleared by pgstat_send_tabstats().
+ */
+bool		have_relation_stats;
+
+
+/*
+ * Hash table for t_id -> tsa_entry lookup; built lazily, NULL when absent
+ */
+static HTAB *pgStatTabHash = NULL;
+
+
+/* ----------
+ * pgstat_relation_init() -
+ *
+ *	Attach a per-backend stats entry to a relation that is being opened.
+ *
+ *	On return rel->pgstat_info is NULL if nothing is to be counted for the
+ *	relation, else it points at the relation's PgStat_TableStatus entry.
+ *	We assume relcache.c zeroes the field when the relcache entry is made;
+ *	it is long-lived after that, so a link whose t_id still matches can be
+ *	kept as is, sparing a lookup each time the relation is reopened.
+ * ----------
+ */
+void
+pgstat_relation_init(Relation rel)
+{
+	char		kind = rel->rd_rel->relkind;
+	Oid			relid = rel->rd_id;
+	PgStat_TableStatus *cached;
+
+	/*
+	 * Only relations with storage, plus partitioned tables, are counted
+	 */
+	if (!RELKIND_HAS_STORAGE(kind) && kind != RELKIND_PARTITIONED_TABLE)
+	{
+		rel->pgstat_info = NULL;
+		return;
+	}
+
+	/* Without a collector socket, or with tracking off, count nothing */
+	if (!pgstat_track_counts || pgStatSock == PGINVALID_SOCKET)
+	{
+		rel->pgstat_info = NULL;
+		return;
+	}
+
+	/*
+	 * A link whose t_id still names this relation remains usable as is
+	 */
+	cached = rel->pgstat_info;
+	if (cached != NULL && cached->t_id == relid)
+		return;
+
+	/* Otherwise look the entry up, creating it when missing, and link it */
+	rel->pgstat_info = get_tabstat_entry(relid, rel->rd_rel->relisshared);
+}
+
+/* ----------
+ * pgstat_drop_relation() -
+ *
+ *	Tell the collector that we just dropped a relation, by sending a
+ *	one-entry PGSTAT_MTYPE_TABPURGE message for relid in MyDatabaseId.
+ *	(If the message gets lost, we will still clean the dead entry eventually
+ *	via future invocations of pgstat_vacuum_stat().)
+ *
+ *	Compiled out (NOT_USED) for lack of any good place to call it; we rely
+ *	entirely on pgstat_vacuum_stat() to clean out stats for dead rels.
+ * ----------
+ */
+#ifdef NOT_USED
+void
+pgstat_drop_relation(Oid relid)
+{
+	PgStat_MsgTabpurge msg;
+	int			len;			/* bytes of msg actually handed to pgstat_send */
+
+	if (pgStatSock == PGINVALID_SOCKET)
+		return;					/* no collector socket, nothing to send */
+
+	msg.m_tableid[0] = relid;
+	msg.m_nentries = 1;
+
+	len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid);
+
+	pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE);
+	msg.m_databaseid = MyDatabaseId;
+	pgstat_send(&msg, len);
+}
+#endif							/* NOT_USED */
+
+/* ----------
+ * pgstat_report_autovac() -
+ *
+ *	Notify the collector that an autovacuum process has started working on
+ *	the database identified by dboid.  The OID is taken as an argument
+ *	because this runs before InitPostgres is done, so MyDatabaseId is unset.
+ * ----------
+ */
+void
+pgstat_report_autovac(Oid dboid)
+{
+	PgStat_MsgAutovacStart startmsg;
+
+	/* No collector socket means there is nobody to tell */
+	if (pgStatSock != PGINVALID_SOCKET)
+	{
+		startmsg.m_databaseid = dboid;
+		startmsg.m_start_time = GetCurrentTimestamp();
+		pgstat_setheader(&startmsg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START);
+		pgstat_send(&startmsg, sizeof(startmsg));
+	}
+}
+
+/* ---------
+ * pgstat_report_vacuum() -
+ *
+ *	Report a just-finished VACUUM of one table, with its tuple counts.
+ * ---------
+ */
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+					 PgStat_Counter livetuples, PgStat_Counter deadtuples)
+{
+	PgStat_MsgVacuum vacmsg;
+
+	if (!pgstat_track_counts || pgStatSock == PGINVALID_SOCKET)
+		return;
+
+	vacmsg.m_databaseid = shared ? InvalidOid : MyDatabaseId;
+	vacmsg.m_tableoid = tableoid;
+	vacmsg.m_live_tuples = livetuples;
+	vacmsg.m_dead_tuples = deadtuples;
+	vacmsg.m_autovacuum = IsAutoVacuumWorkerProcess();
+	vacmsg.m_vacuumtime = GetCurrentTimestamp();
+	pgstat_setheader(&vacmsg.m_hdr, PGSTAT_MTYPE_VACUUM);
+	pgstat_send(&vacmsg, sizeof(vacmsg));
+}
+
+/* --------
+ * pgstat_report_analyze() -
+ *
+ *	Report a just-finished ANALYZE of rel to the collector.
+ *
+ * livetuples and deadtuples are the caller's fresh estimates; resetcounter
+ * says whether the changes_since_analyze counter is to be reset.
+ * --------
+ */
+void
+pgstat_report_analyze(Relation rel,
+					  PgStat_Counter livetuples, PgStat_Counter deadtuples,
+					  bool resetcounter)
+{
+	PgStat_MsgAnalyze anlmsg;
+
+	if (!pgstat_track_counts || pgStatSock == PGINVALID_SOCKET)
+		return;
+
+	/*
+	 * ANALYZE, unlike VACUUM, may run inside a transaction that has already
+	 * inserted and/or deleted rows in the target table, and it will have
+	 * counted those rows as live or dead respectively.  Our own counts for
+	 * them are reported at transaction end, so take them out of what we send
+	 * now; otherwise they would be double-counted after commit.  (This also
+	 * leaves the collector with the right numbers if we abort instead of
+	 * committing.)
+	 *
+	 * Partitioned tables are not worth the trouble, though.
+	 */
+	if (pgstat_relation_should_count(rel) &&
+		rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+	{
+		PgStat_TableStatus *tabstat = rel->pgstat_info;
+		PgStat_TableXactStatus *lvl;
+
+		for (lvl = tabstat->trans; lvl != NULL; lvl = lvl->upper)
+		{
+			livetuples -= lvl->tuples_inserted - lvl->tuples_deleted;
+			deadtuples -= lvl->tuples_updated + lvl->tuples_deleted;
+		}
+		/* count stuff inserted by already-aborted subxacts, too */
+		deadtuples -= tabstat->t_counts.t_delta_dead_tuples;
+		/* ANALYZE only estimates, so the subtraction may have gone negative */
+		livetuples = Max(livetuples, 0);
+		deadtuples = Max(deadtuples, 0);
+	}
+
+	anlmsg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId;
+	anlmsg.m_tableoid = RelationGetRelid(rel);
+	anlmsg.m_live_tuples = livetuples;
+	anlmsg.m_dead_tuples = deadtuples;
+	anlmsg.m_resetcounter = resetcounter;
+	anlmsg.m_autovacuum = IsAutoVacuumWorkerProcess();
+	anlmsg.m_analyzetime = GetCurrentTimestamp();
+	pgstat_setheader(&anlmsg.m_hdr, PGSTAT_MTYPE_ANALYZE);
+	pgstat_send(&anlmsg, sizeof(anlmsg));
+}
+
+/*
+ * pgstat_count_heap_insert - count the insertion of n tuples into rel
+ */
+void
+pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
+{
+	/* relations that are not being counted are left alone */
+	if (!pgstat_relation_should_count(rel))
+		return;
+
+	/* inserts are transactional: add to the current nest level's record */
+	ensure_tabstat_xact_level(rel->pgstat_info);
+	rel->pgstat_info->trans->tuples_inserted += n;
+}
+
+/*
+ * pgstat_count_heap_update - count one tuple update, HOT or not
+ */
+void
+pgstat_count_heap_update(Relation rel, bool hot)
+{
+	/* relations that are not being counted are left alone */
+	if (!pgstat_relation_should_count(rel))
+		return;
+
+	/* the update itself is tracked per (sub)transaction */
+	ensure_tabstat_xact_level(rel->pgstat_info);
+	rel->pgstat_info->trans->tuples_updated++;
+
+	/* t_tuples_hot_updated is nontransactional, so bump it directly */
+	if (hot)
+		rel->pgstat_info->t_counts.t_tuples_hot_updated++;
+}
+
+/*
+ * pgstat_count_heap_delete - count the deletion of one tuple from rel
+ */
+void
+pgstat_count_heap_delete(Relation rel)
+{
+	/* relations that are not being counted are left alone */
+	if (!pgstat_relation_should_count(rel))
+		return;
+
+	/* deletes are transactional: add to the current nest level's record */
+	ensure_tabstat_xact_level(rel->pgstat_info);
+	rel->pgstat_info->trans->tuples_deleted++;
+}
+
+/*
+ * pgstat_count_truncate - reset rel's transactional counters for a truncate
+ */
+void
+pgstat_count_truncate(Relation rel)
+{
+	if (!pgstat_relation_should_count(rel))
+		return;
+
+	ensure_tabstat_xact_level(rel->pgstat_info);
+
+	/* stash the current counters for a possible abort, then zero them */
+	pgstat_truncdrop_save_counters(rel->pgstat_info->trans, false);
+	rel->pgstat_info->trans->tuples_inserted = 0;
+	rel->pgstat_info->trans->tuples_updated = 0;
+	rel->pgstat_info->trans->tuples_deleted = 0;
+}
+
+/*
+ * pgstat_update_heap_dead_tuples - update dead-tuples count
+ *
+ * What is being reported is the nontransactional recovery of "delta" dead
+ * tuples.  Hence t_delta_dead_tuples goes down rather than up, and the
+ * adjustment is applied to the per-table counter directly instead of
+ * passing through transactional state.
+ */
+void
+pgstat_update_heap_dead_tuples(Relation rel, int delta)
+{
+	/* relations that are not being counted are left alone */
+	if (!pgstat_relation_should_count(rel))
+		return;
+
+	/* recovered tuples reduce the pending dead-tuple delta */
+	rel->pgstat_info->t_counts.t_delta_dead_tuples -= delta;
+}
+
+/*
+ * find_tabstat_entry - look up the PgStat_TableStatus entry for rel_id
+ *
+ * Never creates anything: returns NULL when there is no entry.
+ *
+ * Note: if the most recent execution of pgstat_report_stat got an error, an
+ * entry may exist without a hashtable entry pointing at it.  That's okay,
+ * such an entry is simply reported as "doesn't exist".
+ */
+PgStat_TableStatus *
+find_tabstat_entry(Oid rel_id)
+{
+	TabStatHashEntry *hit = NULL;
+
+	/* With no hashtable there cannot be any entries to find */
+	if (pgStatTabHash != NULL)
+		hit = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL);
+
+	/*
+	 * A hash entry's tsa_entry may itself still be NULL.  Handing that
+	 * NULL back is correct, since it likewise means there is no usable
+	 * entry for the relation.
+	 */
+	return hit ? hit->tsa_entry : NULL;
+}
+
+/*
+ * Perform relation stats specific end-of-transaction work. Helper for
+ * AtEOXact_PgStat.
+ *
+ * Fold each table's top-level insert/update/delete counts into its base
+ * tabstat entry; isCommit selects commit or abort accounting.  Nothing is
+ * freed here, since it's all in TopTransactionContext and goes away anyway.
+ */
+void
+AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit)
+{
+	PgStat_TableXactStatus *trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = trans->next)
+	{
+		PgStat_TableStatus *tabstat;
+
+		Assert(trans->nest_level == 1);
+		Assert(trans->upper == NULL);
+		tabstat = trans->parent;
+		Assert(tabstat->trans == trans);
+		/* restore pre-truncate/drop stats (if any) in case of aborted xact */
+		if (!isCommit)
+			pgstat_truncdrop_restore_counters(trans);
+		/* count attempted actions regardless of commit/abort */
+		tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+		tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+		tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+		if (isCommit)
+		{
+			tabstat->t_counts.t_truncdropped = trans->truncdropped;
+			if (trans->truncdropped)
+			{
+				/* forget live/dead stats seen by backend thus far */
+				tabstat->t_counts.t_delta_live_tuples = 0;
+				tabstat->t_counts.t_delta_dead_tuples = 0;
+			}
+			/* insert adds a live tuple, delete removes one */
+			tabstat->t_counts.t_delta_live_tuples +=
+				trans->tuples_inserted - trans->tuples_deleted;
+			/* update and delete each create a dead tuple */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_updated + trans->tuples_deleted;
+			/* insert, update, delete each count as one change event */
+			tabstat->t_counts.t_changed_tuples +=
+				trans->tuples_inserted + trans->tuples_updated +
+				trans->tuples_deleted;
+		}
+		else
+		{
+			/* inserted tuples are dead, deleted tuples are unaffected */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_inserted + trans->tuples_updated;
+			/* an aborted xact generates no changed_tuple events */
+		}
+		tabstat->trans = NULL;	/* unlink the finished transactional state */
+	}
+}
+
+/*
+ * Perform relation stats specific end-of-sub-transaction work. Helper for
+ * AtEOSubXact_PgStat.
+ *
+ * On commit, merge this level's counts into the next higher (sub)xact state;
+ * on abort, fold them into the base tabstat entry as aborted work.
+ */
+void
+AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth)
+{
+	PgStat_TableXactStatus *trans;
+	PgStat_TableXactStatus *next_trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = next_trans)
+	{
+		PgStat_TableStatus *tabstat;
+
+		next_trans = trans->next;	/* fetch now: trans is freed or relinked below */
+		Assert(trans->nest_level == nestDepth);
+		tabstat = trans->parent;
+		Assert(tabstat->trans == trans);
+
+		if (isCommit)
+		{
+			if (trans->upper && trans->upper->nest_level == nestDepth - 1)
+			{
+				if (trans->truncdropped)
+				{
+					/* propagate the truncate/drop status one level up */
+					pgstat_truncdrop_save_counters(trans->upper, false);
+					/* replace upper xact stats with ours */
+					trans->upper->tuples_inserted = trans->tuples_inserted;
+					trans->upper->tuples_updated = trans->tuples_updated;
+					trans->upper->tuples_deleted = trans->tuples_deleted;
+				}
+				else
+				{
+					trans->upper->tuples_inserted += trans->tuples_inserted;
+					trans->upper->tuples_updated += trans->tuples_updated;
+					trans->upper->tuples_deleted += trans->tuples_deleted;
+				}
+				tabstat->trans = trans->upper;
+				pfree(trans);
+			}
+			else
+			{
+				/*
+				 * When there isn't an immediate parent state, we can just
+				 * reuse the record instead of going through a palloc/pfree
+				 * pushup (this works since it's all in TopTransactionContext
+				 * anyway).  We have to re-link it into the parent level,
+				 * though, and that might mean pushing a new entry into the
+				 * pgStatXactStack.
+				 */
+				PgStat_SubXactStatus *upper_xact_state;
+
+				upper_xact_state = pgstat_xact_stack_level_get(nestDepth - 1);
+				trans->next = upper_xact_state->first;
+				upper_xact_state->first = trans;
+				trans->nest_level = nestDepth - 1;
+			}
+		}
+		else
+		{
+			/*
+			 * On abort, update top-level tabstat counts, then forget the
+			 * subtransaction
+			 */
+
+			/* first restore values obliterated by truncate/drop */
+			pgstat_truncdrop_restore_counters(trans);
+			/* count attempted actions regardless of commit/abort */
+			tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted;
+			tabstat->t_counts.t_tuples_updated += trans->tuples_updated;
+			tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted;
+			/* inserted tuples are dead, deleted tuples are unaffected */
+			tabstat->t_counts.t_delta_dead_tuples +=
+				trans->tuples_inserted + trans->tuples_updated;
+			tabstat->trans = trans->upper;
+			pfree(trans);
+		}
+	}
+}
+
+/*
+ * Register one 2PC record per table that has pending transactional stats.
+ * Each record is zero-filled first so no indeterminate padding is written.
+ */
+void
+AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state)
+{
+	PgStat_TableXactStatus *trans;
+
+	for (trans = xact_state->first; trans != NULL; trans = trans->next)
+	{
+		PgStat_TableStatus *tabstat = trans->parent;
+		TwoPhasePgStatRecord record;
+
+		Assert(trans->nest_level == 1);
+		Assert(trans->upper == NULL);
+		Assert(tabstat->trans == trans);
+
+		/* all sizeof(record) bytes are registered, any padding included */
+		MemSet(&record, 0, sizeof(record));
+		record.tuples_inserted = trans->tuples_inserted;
+		record.tuples_updated = trans->tuples_updated;
+		record.tuples_deleted = trans->tuples_deleted;
+		record.inserted_pre_truncdrop = trans->inserted_pre_truncdrop;
+		record.updated_pre_truncdrop = trans->updated_pre_truncdrop;
+		record.deleted_pre_truncdrop = trans->deleted_pre_truncdrop;
+		record.t_id = tabstat->t_id;
+		record.t_shared = tabstat->t_shared;
+		record.t_truncdropped = trans->truncdropped;
+		RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0,
+							   &record, sizeof(TwoPhasePgStatRecord));
+	}
+}
+
+/*
+ * Post-PREPARE cleanup of relation stats.
+ *
+ * Only the link from each table's nontransactional entry to its
+ * transactional record is cut here.  The nontransactional action counts
+ * get reported to the stats collector right away, while the live/dead
+ * tuple effects stay behind in the 2PC state file.  Note that
+ * AtEOXact_PgStat_Relations is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state)
+{
+	PgStat_TableXactStatus *cur = xact_state->first;
+
+	/* detach every table's base entry from its transactional record */
+	while (cur != NULL)
+	{
+		cur->parent->trans = NULL;
+		cur = cur->next;
+	}
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Fold the saved counts into our local pgstats state as committed work.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+						   void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *saved = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableCounts *counts;
+	PgStat_Counter ins = saved->tuples_inserted;
+	PgStat_Counter upd = saved->tuples_updated;
+	PgStat_Counter del = saved->tuples_deleted;
+
+	/* Look up the rel's tabstat entry, making one if there is none yet */
+	counts = &get_tabstat_entry(saved->t_id, saved->t_shared)->t_counts;
+
+	counts->t_tuples_inserted += ins;
+	counts->t_tuples_updated += upd;
+	counts->t_tuples_deleted += del;
+	/* a committed truncate/drop discards the deltas gathered so far */
+	counts->t_truncdropped = saved->t_truncdropped;
+	if (saved->t_truncdropped)
+	{
+		counts->t_delta_live_tuples = 0;
+		counts->t_delta_dead_tuples = 0;
+	}
+
+	/* Same arithmetic as the commit branch of AtEOXact_PgStat_Relations */
+	counts->t_delta_live_tuples += ins - del;
+	counts->t_delta_dead_tuples += upd + del;
+	counts->t_changed_tuples += ins + upd + del;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Fold the saved counts into our local pgstats state as aborted work.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+						  void *recdata, uint32 len)
+{
+	TwoPhasePgStatRecord *saved = (TwoPhasePgStatRecord *) recdata;
+	PgStat_TableCounts *counts;
+
+	/* Look up the rel's tabstat entry, making one if there is none yet */
+	counts = &get_tabstat_entry(saved->t_id, saved->t_shared)->t_counts;
+
+	/* Undo a truncate/drop first: go back to the counts saved before it */
+	if (saved->t_truncdropped)
+	{
+		saved->tuples_inserted = saved->inserted_pre_truncdrop;
+		saved->tuples_updated = saved->updated_pre_truncdrop;
+		saved->tuples_deleted = saved->deleted_pre_truncdrop;
+	}
+
+	/* Same arithmetic as the abort branch of AtEOXact_PgStat_Relations */
+	counts->t_tuples_inserted += saved->tuples_inserted;
+	counts->t_tuples_updated += saved->tuples_updated;
+	counts->t_tuples_deleted += saved->tuples_deleted;
+	counts->t_delta_dead_tuples += saved->tuples_inserted + saved->tuples_updated;
+}
+
+/*
+ * Subroutine for pgstat_report_stat: Send relation statistics
+ */
+void
+pgstat_send_tabstats(TimestampTz now, bool disconnect)
+{
+	/* we assume this inits to all zeroes: */
+	static const PgStat_TableCounts all_zeroes;
+	PgStat_MsgTabstat regular_msg;
+	PgStat_MsgTabstat shared_msg;
+	TabStatusArray *tsa;
+	int			i;
+
+	/*
+	 * Destroy pgStatTabHash before we start invalidating PgStat_TableStatus
+	 * entries it points to.  (Should we fail partway through the loop below,
+	 * it's okay to have removed the hashtable already --- the only
+	 * consequence is we'd get multiple entries for the same table in the
+	 * pgStatTabList, and that's safe.)
+	 */
+	if (pgStatTabHash)
+		hash_destroy(pgStatTabHash);
+	pgStatTabHash = NULL;
+
+	/*
+	 * Scan through the TabStatusArray struct(s) to find tables that actually
+	 * have counts, and build messages to send.  We have to separate shared
+	 * relations from regular ones because the databaseid field in the message
+	 * header has to depend on that.
+	 */
+	regular_msg.m_databaseid = MyDatabaseId;
+	shared_msg.m_databaseid = InvalidOid;
+	regular_msg.m_nentries = 0;
+	shared_msg.m_nentries = 0;
+
+	for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next)
+	{
+		for (i = 0; i < tsa->tsa_used; i++)
+		{
+			PgStat_TableStatus *entry = &tsa->tsa_entries[i];
+			PgStat_MsgTabstat *this_msg;
+			PgStat_TableEntry *this_ent;
+
+			/* Shouldn't have any pending transaction-dependent counts */
+			Assert(entry->trans == NULL);
+
+			/*
+			 * Ignore entries that didn't accumulate any actual counts, such
+			 * as indexes that were opened by the planner but not used.
+			 */
+			if (memcmp(&entry->t_counts, &all_zeroes,
+					   sizeof(PgStat_TableCounts)) == 0)
+				continue;
+
+			/*
+			 * OK, insert data into the appropriate message, and send if full.
+			 */
+			this_msg = entry->t_shared ? &shared_msg : &regular_msg;
+			this_ent = &this_msg->m_entry[this_msg->m_nentries];
+			this_ent->t_id = entry->t_id;
+			memcpy(&this_ent->t_counts, &entry->t_counts,
+				   sizeof(PgStat_TableCounts));
+			if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES)
+			{
+				pgstat_send_tabstat(this_msg, now);
+				this_msg->m_nentries = 0;	/* start refilling the same buffer */
+			}
+		}
+		/* zero out PgStat_TableStatus structs after use */
+		MemSet(tsa->tsa_entries, 0,
+			   tsa->tsa_used * sizeof(PgStat_TableStatus));
+		tsa->tsa_used = 0;		/* the array itself is kept for reuse */
+	}
+
+	/*
+	 * Send partial messages.  Make sure that any pending xact commit/abort
+	 * and connection stats get counted, even if there are no table stats to
+	 * send.
+	 */
+	if (regular_msg.m_nentries > 0 ||
+		pgStatXactCommit > 0 || pgStatXactRollback > 0 || disconnect)
+		pgstat_send_tabstat(&regular_msg, now);
+	if (shared_msg.m_nentries > 0)
+		pgstat_send_tabstat(&shared_msg, now);
+
+	have_relation_stats = false;
+}
+
+/*
+ * Subroutine for pgstat_send_tabstats: complete one tabstat message and send
+ */
+static void
+pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now)
+{
+	int			nentries;
+	int			msglen;
+
+	/* It's unlikely we'd get here with no socket, but maybe not impossible */
+	if (pgStatSock == PGINVALID_SOCKET)
+		return;
+
+	/*
+	 * Every normal tabstat message also reports, and thereby resets, the
+	 * accumulated xact commit/rollback and I/O timing data.
+	 */
+	pgstat_update_dbstats(tsmsg, now);
+
+	nentries = tsmsg->m_nentries;
+	msglen = offsetof(PgStat_MsgTabstat, m_entry[0]) +
+		nentries * sizeof(PgStat_TableEntry);
+
+	pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT);
+	pgstat_send(tsmsg, msglen);
+}
+
+/*
+ * get_tabstat_entry - find or create the PgStat_TableStatus entry for rel_id
+ */
+static PgStat_TableStatus *
+get_tabstat_entry(Oid rel_id, bool isshared)
+{
+	TabStatHashEntry *hash_entry;
+	PgStat_TableStatus *entry;
+	TabStatusArray *tsa;
+	bool		found;
+
+	pgstat_assert_is_up();
+
+	have_relation_stats = true; /* set even when the entry already exists */
+
+	/*
+	 * Create hash table if we don't have it already.
+	 */
+	if (pgStatTabHash == NULL)
+	{
+		HASHCTL		ctl;
+
+		ctl.keysize = sizeof(Oid);
+		ctl.entrysize = sizeof(TabStatHashEntry);
+
+		pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table",
+									TABSTAT_QUANTUM,
+									&ctl,
+									HASH_ELEM | HASH_BLOBS);
+	}
+
+	/*
+	 * Find an entry or create a new one.
+	 */
+	hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found);
+	if (!found)
+	{
+		/* initialize new entry with null pointer */
+		hash_entry->tsa_entry = NULL;
+	}
+
+	/*
+	 * If entry is already valid, we're done.
+	 */
+	if (hash_entry->tsa_entry)
+		return hash_entry->tsa_entry;
+
+	/*
+	 * Locate the first pgStatTabList entry with free space, making a new list
+	 * entry if needed.  Note that we could get an OOM failure here, but if so
+	 * we have left the hashtable and the list in a consistent state.
+	 */
+	if (pgStatTabList == NULL)
+	{
+		/* Set up first pgStatTabList entry */
+		pgStatTabList = (TabStatusArray *)
+			MemoryContextAllocZero(TopMemoryContext,
+								   sizeof(TabStatusArray));
+	}
+
+	tsa = pgStatTabList;
+	while (tsa->tsa_used >= TABSTAT_QUANTUM)
+	{
+		if (tsa->tsa_next == NULL)
+			tsa->tsa_next = (TabStatusArray *)
+				MemoryContextAllocZero(TopMemoryContext,
+									   sizeof(TabStatusArray));
+		tsa = tsa->tsa_next;
+	}
+
+	/*
+	 * Allocate a PgStat_TableStatus entry within this list entry.  We assume
+	 * the entry was already zeroed, either at creation or after last use.
+	 */
+	entry = &tsa->tsa_entries[tsa->tsa_used++];
+	entry->t_id = rel_id;
+	entry->t_shared = isshared; /* picks the message it is later sent in */
+
+	/*
+	 * Now we can fill the entry in pgStatTabHash.
+	 */
+	hash_entry->tsa_entry = entry;
+
+	return entry;
+}
+
+/*
+ * add_tabstat_xact_level - push a (sub)transaction state record for a table
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+	PgStat_SubXactStatus *level_state;
+	PgStat_TableXactStatus *rec;
+
+	/*
+	 * Fetch the stack entry for this nest level first; if no rel has been
+	 * modified at this level yet, a transaction stack entry gets pushed.
+	 */
+	level_state = pgstat_xact_stack_level_get(nest_level);
+
+	/* Zeroed per-table record, allocated in TopTransactionContext */
+	rec = (PgStat_TableXactStatus *)
+		MemoryContextAllocZero(TopTransactionContext, sizeof(*rec));
+	rec->nest_level = nest_level;
+	rec->parent = pgstat_info;
+	/* link into the table's own stack, then into this level's list */
+	rec->upper = pgstat_info->trans;
+	pgstat_info->trans = rec;
+	rec->next = level_state->first;
+	level_state->first = rec;
+}
+
+/*
+ * Make sure the table has a state record for the current nest level.
+ */
+static void
+ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info)
+{
+	int			cur_level = GetCurrentTransactionNestLevel();
+	PgStat_TableXactStatus *top = pgstat_info->trans;
+
+	if (top == NULL || top->nest_level != cur_level)
+		add_tabstat_xact_level(pgstat_info, cur_level);
+}
+
+/*
+ * pgstat_truncdrop_save_counters
+ *
+ * Snapshot a table's i/u/d counters at truncate/drop time and flag the
+ * level as truncdropped.  The caller may then clear the live counters, and
+ * the snapshot lets them be put back should the (sub)xact later abort.
+ *
+ * A truncate only snapshots the first time at a given subxact level; a
+ * drop (is_drop) always does.
+ */
+static void
+pgstat_truncdrop_save_counters(PgStat_TableXactStatus *trans, bool is_drop)
+{
+	/* a repeated truncate at this level keeps the first snapshot */
+	if (trans->truncdropped && !is_drop)
+		return;
+
+	trans->truncdropped = true;
+	trans->inserted_pre_truncdrop = trans->tuples_inserted;
+	trans->updated_pre_truncdrop = trans->tuples_updated;
+	trans->deleted_pre_truncdrop = trans->tuples_deleted;
+}
+
+/*
+ * pgstat_truncdrop_restore_counters - put back pre-truncate/drop counters
+ */
+static void
+pgstat_truncdrop_restore_counters(PgStat_TableXactStatus *trans)
+{
+	if (!trans->truncdropped)
+		return;
+
+	trans->tuples_inserted = trans->inserted_pre_truncdrop;
+	trans->tuples_updated = trans->updated_pre_truncdrop;
+	trans->tuples_deleted = trans->deleted_pre_truncdrop;
+}