diff options
author | Alvaro Herrera | 2022-03-20 17:43:40 +0000 |
---|---|---|
committer | Alvaro Herrera | 2022-03-20 17:43:40 +0000 |
commit | ba9a7e392171c83eb3332a757279e7088487f9a2 (patch) | |
tree | bb30fa47d030733e048c3ff77d21b6803bce941c /src/backend | |
parent | 3f513ac7935db86f72511aac24fa6b52ed29bfe7 (diff) |
Enforce foreign key correctly during cross-partition updates
When an update on a partitioned table referenced in foreign key
constraints causes a row to move from one partition to another,
the fact that the move is implemented as a delete followed by an insert
on the target partition causes the foreign key triggers to have
surprising behavior. For example, a given foreign key's delete trigger
which implements the ON DELETE CASCADE clause of that key will delete
any referencing rows when triggered for that internal DELETE, although
it should not, because the referenced row is simply being moved from one
partition of the referenced root partitioned table into another, not
being deleted from it.
This commit teaches trigger.c to skip queuing such delete trigger events
on the leaf partitions in favor of an UPDATE event fired on the root
target relation. Doing so is sensible because both the old and the new
tuple "logically" belong to the root relation.
The after trigger event queuing interface now allows passing the source
and the target partitions of a particular cross-partition update when
registering the update event for the root partitioned table. Along with
the two ctids of the old and the new tuple, the after trigger event now
also stores the OIDs of those partitions. The tuples fetched from the
source and the target partitions are converted into the root table
format, if necessary, before they are passed to the trigger function.
The implementation currently has a limitation that only the foreign keys
pointing into the query's target relation are considered, not those of
its sub-partitioned partitions. That seems like a reasonable
limitation, because it sounds rare to have distinct foreign keys
pointing to sub-partitioned partitions instead of to the root table.
This misbehavior stems from commit f56f8f8da6af (which added support for
foreign keys to reference partitioned tables) not paying sufficient
attention to commit 2f178441044b (which had introduced cross-partition
updates a year earlier). Even though the former commit goes back to
Postgres 12, we're not backpatching this fix at this time for fear of
destabilizing things too much, and because there are a few ABI breaks in
it that we'd have to work around in older branches. It also depends on
commit f4566345cf40, which had its own share of backpatchability issues
as well.
Author: Amit Langote <[email protected]>
Reviewed-by: Masahiko Sawada <[email protected]>
Reviewed-by: Álvaro Herrera <[email protected]>
Reported-by: Eduard Català <[email protected]>
Discussion: https://postgr.es/m/CA+HiwqFvkBCmfwkQX_yBqv2Wz8ugUGiBDxum8=WvVbfU1TXaNg@mail.gmail.com
Discussion: https://postgr.es/m/CAL54xNZsLwEM1XCk5yW9EqaRzsZYHuWsHQkA2L5MOSKXAwviCQ@mail.gmail.com
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/commands/trigger.c | 394 | ||||
-rw-r--r-- | src/backend/executor/execMain.c | 86 | ||||
-rw-r--r-- | src/backend/executor/execReplication.c | 5 | ||||
-rw-r--r-- | src/backend/executor/nodeModifyTable.c | 151 | ||||
-rw-r--r-- | src/backend/utils/adt/ri_triggers.c | 6 |
5 files changed, 569 insertions, 73 deletions
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index e08bd9a370f..fce79b02a57 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -95,10 +95,13 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata, Instrumentation *instr, MemoryContext per_tuple_context); static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, + ResultRelInfo *src_partinfo, + ResultRelInfo *dst_partinfo, int event, bool row_trigger, TupleTableSlot *oldtup, TupleTableSlot *newtup, List *recheckIndexes, Bitmapset *modifiedCols, - TransitionCaptureState *transition_capture); + TransitionCaptureState *transition_capture, + bool is_crosspart_update); static void AfterTriggerEnlargeQueryState(void); static bool before_stmt_triggers_fired(Oid relid, CmdType cmdType); @@ -2458,8 +2461,10 @@ ExecASInsertTriggers(EState *estate, ResultRelInfo *relinfo, TriggerDesc *trigdesc = relinfo->ri_TrigDesc; if (trigdesc && trigdesc->trig_insert_after_statement) - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_INSERT, - false, NULL, NULL, NIL, NULL, transition_capture); + AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, + TRIGGER_EVENT_INSERT, + false, NULL, NULL, NIL, NULL, transition_capture, + false); } bool @@ -2547,10 +2552,12 @@ ExecARInsertTriggers(EState *estate, ResultRelInfo *relinfo, if ((trigdesc && trigdesc->trig_insert_after_row) || (transition_capture && transition_capture->tcs_insert_new_table)) - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_INSERT, + AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, + TRIGGER_EVENT_INSERT, true, NULL, slot, recheckIndexes, NULL, - transition_capture); + transition_capture, + false); } bool @@ -2672,8 +2679,10 @@ ExecASDeleteTriggers(EState *estate, ResultRelInfo *relinfo, TriggerDesc *trigdesc = relinfo->ri_TrigDesc; if (trigdesc && trigdesc->trig_delete_after_statement) - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_DELETE, - false, NULL, NULL, NIL, NULL, transition_capture); + AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, + TRIGGER_EVENT_DELETE, + false, NULL, NULL, NIL, NULL, transition_capture, + false); } /* @@ -2768,11 +2777,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, return result; } +/* + * Note: is_crosspart_update must be true if the DELETE is being performed + * as part of a cross-partition update. + */ void -ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, +ExecARDeleteTriggers(EState *estate, + ResultRelInfo *relinfo, ItemPointer tupleid, HeapTuple fdw_trigtuple, - TransitionCaptureState *transition_capture) + TransitionCaptureState *transition_capture, + bool is_crosspart_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2793,9 +2808,11 @@ ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, else ExecForceStoreHeapTuple(fdw_trigtuple, slot, false); - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_DELETE, + AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, + TRIGGER_EVENT_DELETE, true, slot, NULL, NIL, NULL, - transition_capture); + transition_capture, + is_crosspart_update); } } @@ -2914,10 +2931,12 @@ ExecASUpdateTriggers(EState *estate, ResultRelInfo *relinfo, Assert(relinfo->ri_RootResultRelInfo == NULL); if (trigdesc && trigdesc->trig_update_after_statement) - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_UPDATE, + AfterTriggerSaveEvent(estate, relinfo, NULL, NULL, + TRIGGER_EVENT_UPDATE, false, NULL, NULL, NIL, ExecGetAllUpdatedCols(relinfo, estate), - transition_capture); + transition_capture, + false); } bool @@ -3052,13 +3071,26 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, return true; } +/* + * Note: 'src_partinfo' and 'dst_partinfo', when non-NULL, refer to the source + * and destination partitions, respectively, of a cross-partition update of + * the root partitioned table mentioned in the query, given by 'relinfo'. + * 'tupleid' in that case refers to the ctid of the "old" tuple in the source + * partition, and 'newslot' contains the "new" tuple in the destination + * partition. This interface allows to support the requirements of + * ExecCrossPartitionUpdateForeignKey(); is_crosspart_update must be true in + * that case. + */ void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, + ResultRelInfo *src_partinfo, + ResultRelInfo *dst_partinfo, ItemPointer tupleid, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, List *recheckIndexes, - TransitionCaptureState *transition_capture) + TransitionCaptureState *transition_capture, + bool is_crosspart_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -3073,12 +3105,19 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, * separately for DELETE and INSERT to capture transition table rows. * In such case, either old tuple or new tuple can be NULL. */ - TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); + TupleTableSlot *oldslot; + ResultRelInfo *tupsrc; + + Assert((src_partinfo != NULL && dst_partinfo != NULL) || + !is_crosspart_update); + + tupsrc = src_partinfo ? src_partinfo : relinfo; + oldslot = ExecGetTriggerOldSlot(estate, tupsrc); if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) GetTupleForTrigger(estate, NULL, - relinfo, + tupsrc, tupleid, LockTupleExclusive, oldslot, @@ -3088,10 +3127,14 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, else ExecClearTuple(oldslot); - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_UPDATE, - true, oldslot, newslot, recheckIndexes, + AfterTriggerSaveEvent(estate, relinfo, + src_partinfo, dst_partinfo, + TRIGGER_EVENT_UPDATE, + true, + oldslot, newslot, recheckIndexes, ExecGetAllUpdatedCols(relinfo, estate), - transition_capture); + transition_capture, + is_crosspart_update); } } @@ -3214,8 +3257,11 @@ ExecASTruncateTriggers(EState *estate, ResultRelInfo *relinfo) TriggerDesc *trigdesc = relinfo->ri_TrigDesc; if (trigdesc && trigdesc->trig_truncate_after_statement) - AfterTriggerSaveEvent(estate, relinfo, TRIGGER_EVENT_TRUNCATE, - false, NULL, NULL, NIL, NULL, NULL); + AfterTriggerSaveEvent(estate, relinfo, + NULL, NULL, + TRIGGER_EVENT_TRUNCATE, + false, NULL, NULL, NIL, NULL, NULL, + false); } @@ -3496,9 +3542,9 @@ typedef SetConstraintStateData *SetConstraintState; * Per-trigger-event data * * The actual per-event data, AfterTriggerEventData, includes DONE/IN_PROGRESS - * status bits and up to two tuple CTIDs. Each event record also has an - * associated AfterTriggerSharedData that is shared across all instances of - * similar events within a "chunk". + * status bits, up to two tuple CTIDs, and optionally two OIDs of partitions. + * Each event record also has an associated AfterTriggerSharedData that is + * shared across all instances of similar events within a "chunk". * * For row-level triggers, we arrange not to waste storage on unneeded ctid * fields. Updates of regular tables use two; inserts and deletes of regular @@ -3509,6 +3555,11 @@ typedef SetConstraintStateData *SetConstraintState; * tuple(s). This permits storing tuples once regardless of the number of * row-level triggers on a foreign table. * + * When updates on partitioned tables cause rows to move between partitions, + * the OIDs of both partitions are stored too, so that the tuples can be + * fetched; such entries are marked AFTER_TRIGGER_CP_UPDATE (for "cross- + * partition update"). + * * Note that we need triggers on foreign tables to be fired in exactly the * order they were queued, so that the tuples come out of the tuplestore in * the right order. To ensure that, we forbid deferrable (constraint) @@ -3531,16 +3582,16 @@ typedef SetConstraintStateData *SetConstraintState; */ typedef uint32 TriggerFlags; -#define AFTER_TRIGGER_OFFSET 0x0FFFFFFF /* must be low-order bits */ -#define AFTER_TRIGGER_DONE 0x10000000 -#define AFTER_TRIGGER_IN_PROGRESS 0x20000000 +#define AFTER_TRIGGER_OFFSET 0x07FFFFFF /* must be low-order bits */ +#define AFTER_TRIGGER_DONE 0x80000000 +#define AFTER_TRIGGER_IN_PROGRESS 0x40000000 /* bits describing the size and tuple sources of this event */ #define AFTER_TRIGGER_FDW_REUSE 0x00000000 -#define AFTER_TRIGGER_FDW_FETCH 0x80000000 -#define AFTER_TRIGGER_1CTID 0x40000000 -#define AFTER_TRIGGER_2CTID 0xC0000000 -#define AFTER_TRIGGER_TUP_BITS 0xC0000000 - +#define AFTER_TRIGGER_FDW_FETCH 0x20000000 +#define AFTER_TRIGGER_1CTID 0x10000000 +#define AFTER_TRIGGER_2CTID 0x30000000 +#define AFTER_TRIGGER_CP_UPDATE 0x08000000 +#define AFTER_TRIGGER_TUP_BITS 0x38000000 typedef struct AfterTriggerSharedData *AfterTriggerShared; typedef struct AfterTriggerSharedData @@ -3560,27 +3611,45 @@ typedef struct AfterTriggerEventData TriggerFlags ate_flags; /* status bits and offset to shared data */ ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */ ItemPointerData ate_ctid2; /* new updated tuple */ + + /* + * During a cross-partition update of a partitioned table, we also store + * the OIDs of source and destination partitions that are needed to fetch + * the old (ctid1) and the new tuple (ctid2) from, respectively. + */ + Oid ate_src_part; + Oid ate_dst_part; } AfterTriggerEventData; -/* AfterTriggerEventData, minus ate_ctid2 */ +/* AfterTriggerEventData, minus ate_src_part, ate_dst_part */ +typedef struct AfterTriggerEventDataNoOids +{ + TriggerFlags ate_flags; + ItemPointerData ate_ctid1; + ItemPointerData ate_ctid2; +} AfterTriggerEventDataNoOids; + +/* AfterTriggerEventData, minus ate_*_part and ate_ctid2 */ typedef struct AfterTriggerEventDataOneCtid { TriggerFlags ate_flags; /* status bits and offset to shared data */ ItemPointerData ate_ctid1; /* inserted, deleted, or old updated tuple */ } AfterTriggerEventDataOneCtid; -/* AfterTriggerEventData, minus ate_ctid1 and ate_ctid2 */ +/* AfterTriggerEventData, minus ate_*_part, ate_ctid1 and ate_ctid2 */ typedef struct AfterTriggerEventDataZeroCtids { TriggerFlags ate_flags; /* status bits and offset to shared data */ } AfterTriggerEventDataZeroCtids; #define SizeofTriggerEvent(evt) \ - (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? \ + (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_CP_UPDATE ? \ sizeof(AfterTriggerEventData) : \ - ((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_1CTID ? \ - sizeof(AfterTriggerEventDataOneCtid) : \ - sizeof(AfterTriggerEventDataZeroCtids)) + (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID ? \ + sizeof(AfterTriggerEventDataNoOids) : \ + (((evt)->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_1CTID ? \ + sizeof(AfterTriggerEventDataOneCtid) : \ + sizeof(AfterTriggerEventDataZeroCtids)))) #define GetTriggerSharedData(evt) \ ((AfterTriggerShared) ((char *) (evt) + ((evt)->ate_flags & AFTER_TRIGGER_OFFSET))) @@ -3762,6 +3831,8 @@ static AfterTriggersData afterTriggers; static void AfterTriggerExecute(EState *estate, AfterTriggerEvent event, ResultRelInfo *relInfo, + ResultRelInfo *src_relInfo, + ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, Instrumentation *instr, @@ -4096,8 +4167,16 @@ afterTriggerDeleteHeadEventChunk(AfterTriggersQueryData *qs) * fmgr lookup cache space at the caller level. (For triggers fired at * the end of a query, we can even piggyback on the executor's state.) * + * When fired for a cross-partition update of a partitioned table, the old + * tuple is fetched using 'src_relInfo' (the source leaf partition) and + * the new tuple using 'dst_relInfo' (the destination leaf partition), though + * both are converted into the root partitioned table's format before passing + * to the trigger function. + * * event: event currently being fired. - * rel: open relation for event. + * relInfo: result relation for event. + * src_relInfo: source partition of a cross-partition update + * dst_relInfo: its destination partition * trigdesc: working copy of rel's trigger info. * finfo: array of fmgr lookup cache entries (one per trigger in trigdesc). * instr: array of EXPLAIN ANALYZE instrumentation nodes (one per trigger), @@ -4111,6 +4190,8 @@ static void AfterTriggerExecute(EState *estate, AfterTriggerEvent event, ResultRelInfo *relInfo, + ResultRelInfo *src_relInfo, + ResultRelInfo *dst_relInfo, TriggerDesc *trigdesc, FmgrInfo *finfo, Instrumentation *instr, MemoryContext per_tuple_context, @@ -4118,6 +4199,8 @@ AfterTriggerExecute(EState *estate, TupleTableSlot *trig_tuple_slot2) { Relation rel = relInfo->ri_RelationDesc; + Relation src_rel = src_relInfo->ri_RelationDesc; + Relation dst_rel = dst_relInfo->ri_RelationDesc; AfterTriggerShared evtshared = GetTriggerSharedData(event); Oid tgoid = evtshared->ats_tgoid; TriggerData LocTriggerData = {0}; @@ -4198,12 +4281,35 @@ AfterTriggerExecute(EState *estate, default: if (ItemPointerIsValid(&(event->ate_ctid1))) { - LocTriggerData.tg_trigslot = ExecGetTriggerOldSlot(estate, relInfo); + TupleTableSlot *src_slot = ExecGetTriggerOldSlot(estate, + src_relInfo); - if (!table_tuple_fetch_row_version(rel, &(event->ate_ctid1), + if (!table_tuple_fetch_row_version(src_rel, + &(event->ate_ctid1), SnapshotAny, - LocTriggerData.tg_trigslot)) + src_slot)) elog(ERROR, "failed to fetch tuple1 for AFTER trigger"); + + /* + * Store the tuple fetched from the source partition into the + * target (root partitioned) table slot, converting if needed. + */ + if (src_relInfo != relInfo) + { + TupleConversionMap *map = ExecGetChildToRootMap(src_relInfo); + + LocTriggerData.tg_trigslot = ExecGetTriggerOldSlot(estate, relInfo); + if (map) + { + execute_attr_map_slot(map->attrMap, + src_slot, + LocTriggerData.tg_trigslot); + } + else + ExecCopySlot(LocTriggerData.tg_trigslot, src_slot); + } + else + LocTriggerData.tg_trigslot = src_slot; LocTriggerData.tg_trigtuple = ExecFetchSlotHeapTuple(LocTriggerData.tg_trigslot, false, &should_free_trig); } @@ -4213,16 +4319,40 @@ AfterTriggerExecute(EState *estate, } /* don't touch ctid2 if not there */ - if ((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == - AFTER_TRIGGER_2CTID && + if (((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == AFTER_TRIGGER_2CTID || + (event->ate_flags & AFTER_TRIGGER_CP_UPDATE)) && ItemPointerIsValid(&(event->ate_ctid2))) { - LocTriggerData.tg_newslot = ExecGetTriggerNewSlot(estate, relInfo); + TupleTableSlot *dst_slot = ExecGetTriggerNewSlot(estate, + dst_relInfo); - if (!table_tuple_fetch_row_version(rel, &(event->ate_ctid2), + if (!table_tuple_fetch_row_version(dst_rel, + &(event->ate_ctid2), SnapshotAny, - LocTriggerData.tg_newslot)) + dst_slot)) elog(ERROR, "failed to fetch tuple2 for AFTER trigger"); + + /* + * Store the tuple fetched from the destination partition into + * the target (root partitioned) table slot, converting if + * needed. + */ + if (dst_relInfo != relInfo) + { + TupleConversionMap *map = ExecGetChildToRootMap(dst_relInfo); + + LocTriggerData.tg_newslot = ExecGetTriggerNewSlot(estate, relInfo); + if (map) + { + execute_attr_map_slot(map->attrMap, + dst_slot, + LocTriggerData.tg_newslot); + } + else + ExecCopySlot(LocTriggerData.tg_newslot, dst_slot); + } + else + LocTriggerData.tg_newslot = dst_slot; LocTriggerData.tg_newtuple = ExecFetchSlotHeapTuple(LocTriggerData.tg_newslot, false, &should_free_new); } @@ -4451,13 +4581,17 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, if ((event->ate_flags & AFTER_TRIGGER_IN_PROGRESS) && evtshared->ats_firing_id == firing_id) { + ResultRelInfo *src_rInfo, + *dst_rInfo; + /* * So let's fire it... but first, find the correct relation if * this is not the same relation as before. */ if (rel == NULL || RelationGetRelid(rel) != evtshared->ats_relid) { - rInfo = ExecGetTriggerResultRel(estate, evtshared->ats_relid); + rInfo = ExecGetTriggerResultRel(estate, evtshared->ats_relid, + NULL); rel = rInfo->ri_RelationDesc; /* Catch calls with insufficient relcache refcounting */ Assert(!RelationHasReferenceCountZero(rel)); @@ -4483,11 +4617,32 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events, } /* + * Look up source and destination partition result rels of a + * cross-partition update event. + */ + if ((event->ate_flags & AFTER_TRIGGER_TUP_BITS) == + AFTER_TRIGGER_CP_UPDATE) + { + Assert(OidIsValid(event->ate_src_part) && + OidIsValid(event->ate_dst_part)); + src_rInfo = ExecGetTriggerResultRel(estate, + event->ate_src_part, + rInfo); + dst_rInfo = ExecGetTriggerResultRel(estate, + event->ate_dst_part, + rInfo); + } + else + src_rInfo = dst_rInfo = rInfo; + + /* * Fire it. Note that the AFTER_TRIGGER_IN_PROGRESS flag is * still set, so recursive examinations of the event list * won't try to re-fire it. */ - AfterTriggerExecute(estate, event, rInfo, trigdesc, finfo, instr, + AfterTriggerExecute(estate, event, rInfo, + src_rInfo, dst_rInfo, + trigdesc, finfo, instr, per_tuple_context, slot1, slot2); /* @@ -5767,14 +5922,35 @@ AfterTriggerPendingOnRel(Oid relid) * Transition tuplestores are built now, rather than when events are pulled * off of the queue because AFTER ROW triggers are allowed to select from the * transition tables for the statement. + * + * This contains special support to queue the update events for the case where + * a partitioned table undergoing a cross-partition update may have foreign + * keys pointing into it. Normally, a partitioned table's row triggers are + * not fired because the leaf partition(s) which are modified as a result of + * the operation on the partitioned table contain the same triggers which are + * fired instead. But that general scheme can cause problematic behavior with + * foreign key triggers during cross-partition updates, which are implemented + * as DELETE on the source partition followed by INSERT into the destination + * partition. Specifically, firing DELETE triggers would lead to the wrong + * foreign key action to be enforced considering that the original command is + * UPDATE; in this case, this function is called with relinfo as the + * partitioned table, and src_partinfo and dst_partinfo referring to the + * source and target leaf partitions, respectively. + * + * is_crosspart_update is true either when a DELETE event is fired on the + * source partition (which is to be ignored) or an UPDATE event is fired on + * the root partitioned table. * ---------- */ static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, + ResultRelInfo *src_partinfo, + ResultRelInfo *dst_partinfo, int event, bool row_trigger, TupleTableSlot *oldslot, TupleTableSlot *newslot, List *recheckIndexes, Bitmapset *modifiedCols, - TransitionCaptureState *transition_capture) + TransitionCaptureState *transition_capture, + bool is_crosspart_update) { Relation rel = relinfo->ri_RelationDesc; TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -5855,6 +6031,19 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, } /* + * We normally don't see partitioned tables here for row level triggers + * except in the special case of a cross-partition update. In that case, + * nodeModifyTable.c:ExecCrossPartitionUpdateForeignKey() calls here to + * queue an update event on the root target partitioned table, also + * passing the source and destination partitions and their tuples. + */ + Assert(!row_trigger || + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE || + (is_crosspart_update && + TRIGGER_FIRED_BY_UPDATE(event) && + src_partinfo != NULL && dst_partinfo != NULL)); + + /* * Validate the event code and collect the associated tuple CTIDs. * * The event code will be used both as a bitmask and an array offset, so @@ -5914,6 +6103,19 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, Assert(newslot != NULL); ItemPointerCopy(&(oldslot->tts_tid), &(new_event.ate_ctid1)); ItemPointerCopy(&(newslot->tts_tid), &(new_event.ate_ctid2)); + + /* + * Also remember the OIDs of partitions to fetch these tuples + * out of later in AfterTriggerExecute(). + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + Assert(src_partinfo != NULL && dst_partinfo != NULL); + new_event.ate_src_part = + RelationGetRelid(src_partinfo->ri_RelationDesc); + new_event.ate_dst_part = + RelationGetRelid(dst_partinfo->ri_RelationDesc); + } } else { @@ -5938,13 +6140,53 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; } + /* Determine flags */ if (!(relkind == RELKIND_FOREIGN_TABLE && row_trigger)) - new_event.ate_flags = (row_trigger && event == TRIGGER_EVENT_UPDATE) ? - AFTER_TRIGGER_2CTID : AFTER_TRIGGER_1CTID; + { + if (row_trigger && event == TRIGGER_EVENT_UPDATE) + { + if (relkind == RELKIND_PARTITIONED_TABLE) + new_event.ate_flags = AFTER_TRIGGER_CP_UPDATE; + else + new_event.ate_flags = AFTER_TRIGGER_2CTID; + } + else + new_event.ate_flags = AFTER_TRIGGER_1CTID; + } + /* else, we'll initialize ate_flags for each trigger */ tgtype_level = (row_trigger ? TRIGGER_TYPE_ROW : TRIGGER_TYPE_STATEMENT); + /* + * Must convert/copy the source and destination partition tuples into the + * root partitioned table's format/slot, because the processing in the + * loop below expects both oldslot and newslot tuples to be in that form. + */ + if (row_trigger && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + TupleTableSlot *rootslot; + TupleConversionMap *map; + + rootslot = ExecGetTriggerOldSlot(estate, relinfo); + map = ExecGetChildToRootMap(src_partinfo); + if (map) + oldslot = execute_attr_map_slot(map->attrMap, + oldslot, + rootslot); + else + oldslot = ExecCopySlot(rootslot, oldslot); + + rootslot = ExecGetTriggerNewSlot(estate, relinfo); + map = ExecGetChildToRootMap(dst_partinfo); + if (map) + newslot = execute_attr_map_slot(map->attrMap, + newslot, + rootslot); + else + newslot = ExecCopySlot(rootslot, newslot); + } + for (i = 0; i < trigdesc->numtriggers; i++) { Trigger *trigger = &trigdesc->triggers[i]; @@ -5973,13 +6215,30 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, /* * If the trigger is a foreign key enforcement trigger, there are * certain cases where we can skip queueing the event because we can - * tell by inspection that the FK constraint will still pass. + * tell by inspection that the FK constraint will still pass. There + * are also some cases during cross-partition updates of a partitioned + * table where queuing the event can be skipped. */ if (TRIGGER_FIRED_BY_UPDATE(event) || TRIGGER_FIRED_BY_DELETE(event)) { switch (RI_FKey_trigger_type(trigger->tgfoid)) { case RI_TRIGGER_PK: + + /* + * For cross-partitioned updates of partitioned PK table, + * skip the event fired by the component delete on the + * source leaf partition unless the constraint originates + * in the partition itself (!tgisclone), because the + * update event that will be fired on the root + * (partitioned) target table will be used to perform the + * necessary foreign key enforcement action. + */ + if (is_crosspart_update && + TRIGGER_FIRED_BY_DELETE(event) && + trigger->tgisclone) + continue; + /* Update or delete on trigger's PK table */ if (!RI_FKey_pk_upd_check_required(trigger, rel, oldslot, newslot)) @@ -5990,8 +6249,20 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; case RI_TRIGGER_FK: - /* Update on trigger's FK table */ - if (!RI_FKey_fk_upd_check_required(trigger, rel, + + /* + * Update on trigger's FK table. We can skip the update + * event fired on a partitioned table during a + * cross-partition of that table, because the insert event + * that is fired on the destination leaf partition would + * suffice to perform the necessary foreign key check. + * Moreover, RI_FKey_fk_upd_check_required() expects to be + * passed a tuple that contains system attributes, most of + * which are not present in the virtual slot belonging to + * a partitioned table. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + !RI_FKey_fk_upd_check_required(trigger, rel, oldslot, newslot)) { /* skip queuing this event */ @@ -6000,7 +6271,18 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, break; case RI_TRIGGER_NONE: - /* Not an FK trigger */ + + /* + * Not an FK trigger. No need to queue the update event + * fired during a cross-partitioned update of a + * partitioned table, because the same row trigger must be + * present in the leaf partition(s) that are affected as + * part of this update and the events fired on them are + * queued instead. + */ + if (row_trigger && + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + continue; break; } } diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 549d9eb6963..473d2e00a2f 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -44,6 +44,7 @@ #include "access/transam.h" #include "access/xact.h" #include "catalog/namespace.h" +#include "catalog/partition.h" #include "catalog/pg_publication.h" #include "commands/matview.h" #include "commands/trigger.h" @@ -1279,7 +1280,8 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo, * in es_trig_target_relations. */ ResultRelInfo * -ExecGetTriggerResultRel(EState *estate, Oid relid) +ExecGetTriggerResultRel(EState *estate, Oid relid, + ResultRelInfo *rootRelInfo) { ResultRelInfo *rInfo; ListCell *l; @@ -1330,7 +1332,7 @@ ExecGetTriggerResultRel(EState *estate, Oid relid) InitResultRelInfo(rInfo, rel, 0, /* dummy rangetable index */ - NULL, + rootRelInfo, estate->es_instrument); estate->es_trig_target_relations = lappend(estate->es_trig_target_relations, rInfo); @@ -1344,6 +1346,69 @@ ExecGetTriggerResultRel(EState *estate, Oid relid) return rInfo; } +/* + * Return the ancestor relations of a given leaf partition result relation + * up to and including the query's root target relation. + * + * These work much like the ones opened by ExecGetTriggerResultRel, except + * that we need to keep them in a separate list. + * + * These are closed by ExecCloseResultRelations. + */ +List * +ExecGetAncestorResultRels(EState *estate, ResultRelInfo *resultRelInfo) +{ + ResultRelInfo *rootRelInfo = resultRelInfo->ri_RootResultRelInfo; + Relation partRel = resultRelInfo->ri_RelationDesc; + Oid rootRelOid; + + if (!partRel->rd_rel->relispartition) + elog(ERROR, "cannot find ancestors of a non-partition result relation"); + Assert(rootRelInfo != NULL); + rootRelOid = RelationGetRelid(rootRelInfo->ri_RelationDesc); + if (resultRelInfo->ri_ancestorResultRels == NIL) + { + ListCell *lc; + List *oids = get_partition_ancestors(RelationGetRelid(partRel)); + List *ancResultRels = NIL; + + foreach(lc, oids) + { + Oid ancOid = lfirst_oid(lc); + Relation ancRel; + ResultRelInfo *rInfo; + + /* + * Ignore the root ancestor here, and use ri_RootResultRelInfo + * (below) for it instead. Also, we stop climbing up the + * hierarchy when we find the table that was mentioned in the + * query. + */ + if (ancOid == rootRelOid) + break; + + /* + * All ancestors up to the root target relation must have been + * locked by the planner or AcquireExecutorLocks(). + */ + ancRel = table_open(ancOid, NoLock); + rInfo = makeNode(ResultRelInfo); + + /* dummy rangetable index */ + InitResultRelInfo(rInfo, ancRel, 0, NULL, + estate->es_instrument); + ancResultRels = lappend(ancResultRels, rInfo); + } + ancResultRels = lappend(ancResultRels, rootRelInfo); + resultRelInfo->ri_ancestorResultRels = ancResultRels; + } + + /* We must have found some ancestor */ + Assert(resultRelInfo->ri_ancestorResultRels != NIL); + + return resultRelInfo->ri_ancestorResultRels; +} + /* ---------------------------------------------------------------- * ExecPostprocessPlan * @@ -1443,12 +1508,29 @@ ExecCloseResultRelations(EState *estate) /* * close indexes of result relation(s) if any. (Rels themselves are * closed in ExecCloseRangeTableRelations()) + * + * In addition, close the stub RTs that may be in each resultrel's + * ri_ancestorResultRels. */ foreach(l, estate->es_opened_result_relations) { ResultRelInfo *resultRelInfo = lfirst(l); + ListCell *lc; ExecCloseIndices(resultRelInfo); + foreach(lc, resultRelInfo->ri_ancestorResultRels) + { + ResultRelInfo *rInfo = lfirst(lc); + + /* + * Ancestors with RTI > 0 (should only be the root ancestor) are + * closed by ExecCloseRangeTableRelations. + */ + if (rInfo->ri_RangeTableIndex > 0) + continue; + + table_close(rInfo->ri_RelationDesc, NoLock); + } } /* Close any relations that have been opened by ExecGetTriggerResultRel(). */ diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 09f78f22441..13328141e23 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -517,8 +517,9 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, + NULL, NULL, tid, NULL, slot, - recheckIndexes, NULL); + recheckIndexes, NULL, false); list_free(recheckIndexes); } @@ -557,7 +558,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, - tid, NULL, NULL); + tid, NULL, NULL, false); } } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index babf26810b7..5e4226abe26 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -122,6 +122,12 @@ static void ExecBatchInsert(ModifyTableState *mtstate, int numSlots, EState *estate, bool canSetTag); +static void ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, + ResultRelInfo *sourcePartInfo, + ResultRelInfo *destPartInfo, + ItemPointer tupleid, + TupleTableSlot *oldslot, + TupleTableSlot *newslot); static bool ExecOnConflictUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer conflictTid, @@ -635,6 +641,9 @@ ExecGetUpdateNewTuple(ResultRelInfo *relinfo, * slot contains the new tuple value to be stored. * * Returns RETURNING result if any, otherwise NULL. + * *inserted_tuple is the tuple that's effectively inserted; + * *inserted_destrel is the relation where it was inserted. + * These are only set on success. * * This may change the currently active tuple conversion map in * mtstate->mt_transition_capture, so the callers must take care to @@ -645,7 +654,9 @@ static TupleTableSlot * ExecInsert(ModifyTableContext *context, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, - bool canSetTag) + bool canSetTag, + TupleTableSlot **inserted_tuple, + ResultRelInfo **insert_destrel) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; @@ -1008,11 +1019,14 @@ ExecInsert(ModifyTableContext *context, if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture && mtstate->mt_transition_capture->tcs_update_new_table) { - ExecARUpdateTriggers(estate, resultRelInfo, NULL, + ExecARUpdateTriggers(estate, resultRelInfo, + NULL, NULL, + NULL, NULL, slot, NULL, - mtstate->mt_transition_capture); + mtstate->mt_transition_capture, + false); /* * We've already captured the NEW TABLE row, so make sure any AR @@ -1046,6 +1060,11 @@ ExecInsert(ModifyTableContext *context, if (resultRelInfo->ri_projectReturning) result = ExecProcessReturning(resultRelInfo, slot, planSlot); + if (inserted_tuple) + *inserted_tuple = slot; + if (insert_destrel) + *insert_destrel = resultRelInfo; + return result; } @@ -1160,7 +1179,7 @@ ExecDeleteAct(ModifyTableContext *context, ResultRelInfo *resultRelInfo, */ static void ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, - ItemPointer tupleid, HeapTuple oldtuple) + ItemPointer tupleid, HeapTuple oldtuple, bool changingPart) { ModifyTableState *mtstate = context->mtstate; EState *estate = context->estate; @@ -1176,8 +1195,11 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture && mtstate->mt_transition_capture->tcs_update_old_table) { - ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, - NULL, NULL, mtstate->mt_transition_capture); + ExecARUpdateTriggers(estate, resultRelInfo, + NULL, NULL, + tupleid, oldtuple, + NULL, NULL, mtstate->mt_transition_capture, + false); /* * We've already captured the NEW TABLE row, so make sure any AR @@ -1188,7 +1210,7 @@ ExecDeleteEpilogue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, - ar_delete_trig_tcs); + ar_delete_trig_tcs, changingPart); } /* ---------------------------------------------------------------- @@ -1457,7 +1479,7 @@ ldelete:; if (tupleDeleted) *tupleDeleted = true; - ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple); + ExecDeleteEpilogue(context, resultRelInfo, tupleid, oldtuple, changingPart); /* Process RETURNING if present and if requested */ if (processReturning && resultRelInfo->ri_projectReturning) @@ -1526,7 +1548,10 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, ResultRelInfo *resultRelInfo, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, - bool canSetTag, UpdateContext *updateCxt) + bool canSetTag, + UpdateContext *updateCxt, + TupleTableSlot **inserted_tuple, + ResultRelInfo **insert_destrel) { ModifyTableState *mtstate = context->mtstate; EState *estate = mtstate->ps.state; @@ -1652,7 +1677,8 @@ ExecCrossPartitionUpdate(ModifyTableContext *context, /* Tuple routing starts from the root table. */ context->cpUpdateReturningSlot = - ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag); + ExecInsert(context, mtstate->rootResultRelInfo, slot, canSetTag, + inserted_tuple, insert_destrel); /* * Reset the transition state that may possibly have been written by @@ -1793,6 +1819,9 @@ lreplace:; */ if (partition_constraint_failed) { + TupleTableSlot *inserted_tuple; + ResultRelInfo *insert_destrel; + /* * ExecCrossPartitionUpdate will first DELETE the row from the * partition it's currently in and then insert it back into the root @@ -1801,11 +1830,37 @@ lreplace:; */ if (ExecCrossPartitionUpdate(context, resultRelInfo, tupleid, oldtuple, slot, - canSetTag, updateCxt)) + canSetTag, updateCxt, + &inserted_tuple, + &insert_destrel)) { /* success! */ updateCxt->updated = true; updateCxt->crossPartUpdate = true; + + /* + * If the partitioned table being updated is referenced in foreign + * keys, queue up trigger events to check that none of them were + * violated. No special treatment is needed in + * non-cross-partition update situations, because the leaf + * partition's AR update triggers will take care of that. During + * cross-partition updates implemented as delete on the source + * partition followed by insert on the destination partition, + * AR-UPDATE triggers of the root table (that is, the table + * mentioned in the query) must be fired. + * + * NULL insert_destrel means that the move failed to occur, that + * is, the update failed, so no need to anything in that case. + */ + if (insert_destrel && + resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_after_row) + ExecCrossPartitionUpdateForeignKey(context, + resultRelInfo, + insert_destrel, + tupleid, slot, + inserted_tuple); + return TM_Ok; } @@ -1871,11 +1926,13 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, + NULL, NULL, tupleid, oldtuple, slot, recheckIndexes, mtstate->operation == CMD_INSERT ? mtstate->mt_oc_transition_capture : - mtstate->mt_transition_capture); + mtstate->mt_transition_capture, + false); /* * Check any WITH CHECK OPTION constraints from parent views. We are @@ -1891,6 +1948,74 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, slot, context->estate); } +/* + * Queues up an update event using the target root partitioned table's + * trigger to check that a cross-partition update hasn't broken any foreign + * keys pointing into it. + */ +static void +ExecCrossPartitionUpdateForeignKey(ModifyTableContext *context, + ResultRelInfo *sourcePartInfo, + ResultRelInfo *destPartInfo, + ItemPointer tupleid, + TupleTableSlot *oldslot, + TupleTableSlot *newslot) +{ + ListCell *lc; + ResultRelInfo *rootRelInfo; + List *ancestorRels; + + rootRelInfo = sourcePartInfo->ri_RootResultRelInfo; + ancestorRels = ExecGetAncestorResultRels(context->estate, sourcePartInfo); + + /* + * For any foreign keys that point directly into a non-root ancestors of + * the source partition, we can in theory fire an update event to enforce + * those constraints using their triggers, if we could tell that both the + * source and the destination partitions are under the same ancestor. But + * for now, we simply report an error that those cannot be enforced. + */ + foreach(lc, ancestorRels) + { + ResultRelInfo *rInfo = lfirst(lc); + TriggerDesc *trigdesc = rInfo->ri_TrigDesc; + bool has_noncloned_fkey = false; + + /* Root ancestor's triggers will be processed. */ + if (rInfo == rootRelInfo) + continue; + + if (trigdesc && trigdesc->trig_update_after_row) + { + for (int i = 0; i < trigdesc->numtriggers; i++) + { + Trigger *trig = &trigdesc->triggers[i]; + + if (!trig->tgisclone && + RI_FKey_trigger_type(trig->tgfoid) == RI_TRIGGER_PK) + { + has_noncloned_fkey = true; + break; + } + } + } + + if (has_noncloned_fkey) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot move tuple across partitions when a non-root ancestor of the source partition is directly referenced in a foreign key"), + errdetail("A foreign key points to ancestor \"%s\", but not the root ancestor \"%s\".", + RelationGetRelationName(rInfo->ri_RelationDesc), + RelationGetRelationName(rootRelInfo->ri_RelationDesc)), + errhint("Consider defining the foreign key on \"%s\".", + RelationGetRelationName(rootRelInfo->ri_RelationDesc)))); + } + + /* Perform the root table's triggers. */ + ExecARUpdateTriggers(context->estate, + rootRelInfo, sourcePartInfo, destPartInfo, + tupleid, NULL, newslot, NIL, NULL, true); +} /* ---------------------------------------------------------------- * ExecUpdate @@ -2745,7 +2870,7 @@ ExecModifyTable(PlanState *pstate) ExecInitInsertProjection(node, resultRelInfo); slot = ExecGetInsertNewTuple(resultRelInfo, planSlot); slot = ExecInsert(&context, resultRelInfo, slot, - node->canSetTag); + node->canSetTag, NULL, NULL); break; case CMD_UPDATE: diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index c95cd324028..01d4c22cfce 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -1261,6 +1261,12 @@ RI_FKey_fk_upd_check_required(Trigger *trigger, Relation fk_rel, TransactionId xmin; bool isnull; + /* + * AfterTriggerSaveEvent() handles things such that this function is never + * called for partitioned tables. + */ + Assert(fk_rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE); + riinfo = ri_FetchConstraintInfo(trigger, fk_rel, false); ri_nullcheck = ri_NullCheck(RelationGetDescr(fk_rel), newslot, riinfo, false); |