Fix logical decoding error when system table w/ toast is repeatedly rewritten.
author Andres Freund <[email protected]>
Wed, 10 Oct 2018 20:53:03 +0000 (13:53 -0700)
committer Andres Freund <[email protected]>
Wed, 10 Oct 2018 20:53:03 +0000 (13:53 -0700)
Repeatedly rewriting a mapped catalog table with VACUUM FULL or
CLUSTER could cause logical decoding to fail with:
ERROR, "could not map filenode \"%s\" to relation OID"

To trigger the problem the rewritten catalog had to have live tuples
with toasted columns.

The problem was triggered as during catalog table rewrites the
heap_insert() check that prevents logical decoding information from being
emitted for system catalogs failed to treat the new heap's toast table
as a system catalog (because the new heap is not recognized as a
catalog table via RelationIsLogicallyLogged()). The relmapper, in
contrast to the normal catalog contents, does not contain historical
information. After a single rewrite of a mapped table the new relation
is known to the relmapper, but if the table is rewritten twice before
logical decoding occurs, the relfilenode cannot be mapped to a
relation anymore, which then leads us to error out.  This only
happens for toast tables, because the main table contents aren't
re-inserted with heap_insert().

The fix is simple: add a new heap_insert() flag that prevents logical
decoding information from being emitted, and accept during decoding
that there might not be tuple data for toast tables.

Unfortunately that does not fix pre-existing logical decoding
errors. Doing so would require not throwing an error when a filenode
cannot be mapped to a relation during decoding, and that seems too
likely to hide bugs.  If it's crucial to fix decoding for an existing
slot, temporarily changing the ERROR in ReorderBufferCommit() to a
WARNING appears to be the best fix.

Author: Andres Freund
Discussion: https://postgr.es/m/20180914021046[email protected]
Backpatch: 9.4-, where logical decoding was introduced

contrib/test_decoding/expected/rewrite.out
contrib/test_decoding/sql/rewrite.sql
src/backend/access/heap/heapam.c
src/backend/access/heap/rewriteheap.c
src/backend/replication/logical/reorderbuffer.c
src/include/access/heapam.h

index 4dcd489543837d4460fd3f926ab22b18f9ee0307..3bf2afa9315f5a1cbc0f7d6f332aa93aeb201c73 100644 (file)
@@ -1,6 +1,61 @@
 -- predictability
 SET synchronous_commit = on;
 DROP TABLE IF EXISTS replication_example;
+-- Ensure there's tables with toast datums.  To do so, we dynamically
+-- create a function returning a large textblob.  We want tables of
+-- different kinds: mapped catalog table, unmapped catalog table,
+-- shared catalog table and usertable.
+CREATE FUNCTION exec(text) returns void language plpgsql volatile
+  AS $f$
+    BEGIN
+      EXECUTE $1;
+    END;
+$f$;
+CREATE ROLE justforcomments NOLOGIN;
+SELECT exec(
+    format($outer$CREATE FUNCTION iamalongfunction() RETURNS TEXT IMMUTABLE LANGUAGE SQL AS $f$SELECT text %L$f$$outer$,
+           (SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i))));
+ exec 
+------
+(1 row)
+
+SELECT exec(
+    format($outer$COMMENT ON FUNCTION iamalongfunction() IS %L$outer$,
+           iamalongfunction()));
+ exec 
+------
+(1 row)
+
+SELECT exec(
+    format($outer$COMMENT ON ROLE JUSTFORCOMMENTS IS %L$outer$,
+           iamalongfunction()));
+ exec 
+------
+(1 row)
+
+CREATE TABLE iamalargetable AS SELECT iamalongfunction() longfunctionoutput;
+-- verify toast usage
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_proc'::regclass)) > 0;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_description'::regclass)) > 0;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_shdescription'::regclass)) > 0;
+ ?column? 
+----------
+ t
+(1 row)
+
 SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding');
  ?column? 
 ----------
@@ -76,6 +131,23 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc
  COMMIT
 (15 rows)
 
+-- trigger repeated rewrites of a system catalog with a toast table,
+-- that previously was buggy: [email protected]
+VACUUM FULL pg_proc; VACUUM FULL pg_description; VACUUM FULL pg_shdescription; VACUUM FULL iamalargetable;
+INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (8, 6, 1);
+VACUUM FULL pg_proc; VACUUM FULL pg_description; VACUUM FULL pg_shdescription; VACUUM FULL iamalargetable;
+INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (9, 7, 1);
+SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
+                                                                                       data                                                                                        
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ BEGIN
+ table public.replication_example: INSERT: id[integer]:9 somedata[integer]:8 text[character varying]:null testcolumn1[integer]:6 testcolumn2[integer]:null testcolumn3[integer]:1
+ COMMIT
+ BEGIN
+ table public.replication_example: INSERT: id[integer]:10 somedata[integer]:9 text[character varying]:null testcolumn1[integer]:7 testcolumn2[integer]:null testcolumn3[integer]:1
+ COMMIT
+(6 rows)
+
 SELECT pg_drop_replication_slot('regression_slot');
  pg_drop_replication_slot 
 --------------------------
@@ -83,3 +155,6 @@ SELECT pg_drop_replication_slot('regression_slot');
 (1 row)
 
 DROP TABLE IF EXISTS replication_example;
+DROP FUNCTION iamalongfunction();
+DROP FUNCTION exec(text);
+DROP ROLE justforcomments;
index 8a7329423ded8bbf06044b95190e3b116f80806d..4271b82bead4b290357f4853afd2e85028b8a94b 100644 (file)
@@ -3,6 +3,35 @@ SET synchronous_commit = on;
 
 DROP TABLE IF EXISTS replication_example;
 
+-- Ensure there's tables with toast datums.  To do so, we dynamically
+-- create a function returning a large textblob.  We want tables of
+-- different kinds: mapped catalog table, unmapped catalog table,
+-- shared catalog table and usertable.
+CREATE FUNCTION exec(text) returns void language plpgsql volatile
+  AS $f$
+    BEGIN
+      EXECUTE $1;
+    END;
+$f$;
+CREATE ROLE justforcomments NOLOGIN;
+
+SELECT exec(
+    format($outer$CREATE FUNCTION iamalongfunction() RETURNS TEXT IMMUTABLE LANGUAGE SQL AS $f$SELECT text %L$f$$outer$,
+           (SELECT repeat(string_agg(to_char(g.i, 'FM0000'), ''), 50) FROM generate_series(1, 500) g(i))));
+SELECT exec(
+    format($outer$COMMENT ON FUNCTION iamalongfunction() IS %L$outer$,
+           iamalongfunction()));
+SELECT exec(
+    format($outer$COMMENT ON ROLE JUSTFORCOMMENTS IS %L$outer$,
+           iamalongfunction()));
+CREATE TABLE iamalargetable AS SELECT iamalongfunction() longfunctionoutput;
+
+-- verify toast usage
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_proc'::regclass)) > 0;
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_description'::regclass)) > 0;
+SELECT pg_relation_size((SELECT reltoastrelid FROM pg_class WHERE oid = 'pg_shdescription'::regclass)) > 0;
+
+
 SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding');
 CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));
 INSERT INTO replication_example(somedata) VALUES (1);
@@ -57,6 +86,17 @@ COMMIT;
 CHECKPOINT;
 
 SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
-SELECT pg_drop_replication_slot('regression_slot');
 
+-- trigger repeated rewrites of a system catalog with a toast table,
+-- that previously was buggy: [email protected]
+VACUUM FULL pg_proc; VACUUM FULL pg_description; VACUUM FULL pg_shdescription; VACUUM FULL iamalargetable;
+INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (8, 6, 1);
+VACUUM FULL pg_proc; VACUUM FULL pg_description; VACUUM FULL pg_shdescription; VACUUM FULL iamalargetable;
+INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (9, 7, 1);
+SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1');
+
+SELECT pg_drop_replication_slot('regression_slot');
 DROP TABLE IF EXISTS replication_example;
+DROP FUNCTION iamalongfunction();
+DROP FUNCTION exec(text);
+DROP ROLE justforcomments;
index ed8dbae0d24badd163ae1156dc54b236fe51f25f..e873e1617e474eb817a6fd3fc4dde21dbf31bbeb 100644 (file)
@@ -2036,6 +2036,18 @@ FreeBulkInsertState(BulkInsertState bistate)
  * This causes rows to be frozen, which is an MVCC violation and
  * requires explicit options chosen by user.
  *
+ * HEAP_INSERT_SPECULATIVE is used on so-called "speculative insertions",
+ * which can be backed out afterwards without aborting the whole transaction.
+ * Other sessions can wait for the speculative insertion to be confirmed,
+ * turning it into a regular tuple, or aborted, as if it never existed.
+ * Speculatively inserted tuples behave as "value locks" of short duration,
+ * used to implement INSERT .. ON CONFLICT.
+ *
+ * HEAP_INSERT_NO_LOGICAL force-disables the emitting of logical decoding
+ * information for the tuple. This should solely be used during table rewrites
+ * where RelationIsLogicallyLogged(relation) is not yet accurate for the new
+ * relation.
+ *
  * Note that these options will be applied when inserting into the heap's
  * TOAST table, too, if the tuple requires any out-of-line data.
  *
@@ -2138,7 +2150,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
         * Also, if this is a catalog, we need to transmit combocids to
         * properly decode, so log that as well.
         */
-       need_tuple_data = RelationIsLogicallyLogged(relation);
+       need_tuple_data = RelationIsLogicallyLogged(relation) &&
+           !(options & HEAP_INSERT_NO_LOGICAL);
        if (RelationIsAccessibleInLogicalDecoding(relation))
            log_heap_new_cid(relation, heaptup);
 
@@ -2325,6 +2338,9 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
    bool        need_tuple_data = RelationIsLogicallyLogged(relation);
    bool        need_cids = RelationIsAccessibleInLogicalDecoding(relation);
 
+   /* currently not needed (thus unsupported) for heap_multi_insert() */
+   AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
+
    needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
    saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                   HEAP_DEFAULT_FILLFACTOR);
index 66c4ea0c3e2fb04ed1d59fce51b83fc4aff08835..b330c6c651f81cc65dcb2a4256ac3b2f4f53d68e 100644 (file)
@@ -651,10 +651,23 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
        heaptup = tup;
    }
    else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
+   {
+       int options = HEAP_INSERT_SKIP_FSM;
+
+       if (!state->rs_use_wal)
+           options |= HEAP_INSERT_SKIP_WAL;
+
+       /*
+        * The new relfilenode's relcache entry doesn't have the necessary
+        * information to determine whether a relation should emit data for
+        * logical decoding.  Force it to off if necessary.
+        */
+       if (!RelationIsLogicallyLogged(state->rs_old_rel))
+           options |= HEAP_INSERT_NO_LOGICAL;
+
        heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
-                                        HEAP_INSERT_SKIP_FSM |
-                                        (state->rs_use_wal ?
-                                         0 : HEAP_INSERT_SKIP_WAL));
+                                        options);
+   }
    else
        heaptup = tup;
 
index db7ddac0a2ebab68a121127e6326fd019c45f273..42d448162418a3464dba32bebfe1bcac28fc848c 100644 (file)
@@ -1509,8 +1509,16 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
                                            change->data.tp.relnode.relNode);
 
                    /*
-                    * Catalog tuple without data, emitted while catalog was
-                    * in the process of being rewritten.
+                    * Mapped catalog tuple without data, emitted while
+                    * catalog table was in the process of being rewritten. We
+                    * can fail to look up the relfilenode, because the
+                    * relmapper has no "historic" view, in contrast to the
+                    * normal catalog during decoding. Thus repeated
+                    * rewrites can cause a lookup failure. That's OK because
+                    * we do not decode catalog changes anyway. Normally such
+                    * tuples would be skipped over below, but we can't
+                    * identify whether the table should be logically logged
+                    * without mapping the relfilenode to the oid.
                     */
                    if (reloid == InvalidOid &&
                        change->data.tp.newtuple == NULL &&
@@ -1564,10 +1572,17 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
                             * transaction's changes. Otherwise it will get
                             * freed/reused while restoring spooled data from
                             * disk.
+                            *
+                            * But skip doing so if there's no
+                            * tuple-data. That happens if a non-mapped system
+                            * catalog with a toast table is rewritten.
                             */
-                           dlist_delete(&change->node);
-                           ReorderBufferToastAppendChunk(rb, txn, relation,
-                                                         change);
+                           if (change->data.tp.newtuple != NULL)
+                           {
+                               dlist_delete(&change->node);
+                               ReorderBufferToastAppendChunk(rb, txn, relation,
+                                                             change);
+                           }
                        }
 
                    }
index c478ccc1d691d0b6e9a2587d657ec4d0bdad1422..6c03950a2e679066e05997a73a4b3a5b1f2b0dbc 100644 (file)
@@ -27,6 +27,8 @@
 #define HEAP_INSERT_SKIP_WAL   0x0001
 #define HEAP_INSERT_SKIP_FSM   0x0002
 #define HEAP_INSERT_FROZEN     0x0004
+/* gap, to keep NO_LOGICAL in sync w/ newer branches */
+#define HEAP_INSERT_NO_LOGICAL 0x0010
 
 typedef struct BulkInsertStateData *BulkInsertState;