You can subscribe to this list here.
2010 |
Jan
|
Feb
|
Mar
|
Apr
(4) |
May
(28) |
Jun
(12) |
Jul
(11) |
Aug
(12) |
Sep
(5) |
Oct
(19) |
Nov
(14) |
Dec
(12) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
(18) |
Feb
(30) |
Mar
(115) |
Apr
(89) |
May
(50) |
Jun
(44) |
Jul
(22) |
Aug
(13) |
Sep
(11) |
Oct
(30) |
Nov
(28) |
Dec
(39) |
2012 |
Jan
(38) |
Feb
(18) |
Mar
(43) |
Apr
(91) |
May
(108) |
Jun
(46) |
Jul
(37) |
Aug
(44) |
Sep
(33) |
Oct
(29) |
Nov
(36) |
Dec
(15) |
2013 |
Jan
(35) |
Feb
(611) |
Mar
(5) |
Apr
(55) |
May
(30) |
Jun
(28) |
Jul
(458) |
Aug
(34) |
Sep
(9) |
Oct
(39) |
Nov
(22) |
Dec
(32) |
2014 |
Jan
(16) |
Feb
(16) |
Mar
(42) |
Apr
(179) |
May
(7) |
Jun
(6) |
Jul
(9) |
Aug
|
Sep
(4) |
Oct
|
Nov
(3) |
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
(2) |
May
(4) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
|
|
1
|
2
(1) |
3
|
4
|
5
|
6
|
7
|
8
|
9
(4) |
10
(1) |
11
|
12
|
13
(1) |
14
(1) |
15
|
16
(1) |
17
|
18
|
19
|
20
|
21
(1) |
22
(1) |
23
(1) |
24
|
25
|
26
|
27
|
28
|
29
|
30
|
31
|
|
From: mason_s <ma...@us...> - 2010-12-23 20:18:12
|
Project "Postgres-XC". The branch, master has been updated via 45e1d4e389e966d072aaf98a49d9702aa253d976 (commit) from 0ab9bbc7600c157618d566f4d9985399e446519d (commit) - Log ----------------------------------------------------------------- commit 45e1d4e389e966d072aaf98a49d9702aa253d976 Author: Mason Sharp <ma...@us...> Date: Thu Dec 23 15:10:38 2010 -0500 Add support for single-step prepared statements. Works for both named and unnamed prepared statements, works for PREPARE and EXECUTE commands. The Coordinator tracks a list of the prepared statements, and prepares them in turn on Data Nodes, and only on demand, when they are first executed on the target node(s). At the end of a transaction, if there are still prepared statements that exist for the session, the connections are not released to the pool. (We should do something similar for temporary tables.) This commit also changes an existing kluge with using the SQL string in some cases, and now deparses from the Query tree instead. Written by Andrei Martsinchyk, multi-step check added by me. diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 58f9845..9ef6f05 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -33,7 +33,11 @@ #include "utils/builtins.h" #include "utils/memutils.h" #include "utils/snapmgr.h" - +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "pgxc/poolmgr.h" +#include "pgxc/execRemote.h" +#endif /* * The hash table in which prepared queries are stored. This is @@ -42,6 +46,14 @@ * (statement names); the entries are PreparedStatement structs. */ static HTAB *prepared_queries = NULL; +#ifdef PGXC +/* + * The hash table where datanode prepared statements are stored. 
+ * The keys are statement names referenced from cached RemoteQuery nodes; the + * entries are DatanodeStatement structs + */ +static HTAB *datanode_queries = NULL; +#endif static void InitQueryHashTable(void); static ParamListInfo EvaluateParams(PreparedStatement *pstmt, List *params, @@ -147,6 +159,22 @@ PrepareQuery(PrepareStmt *stmt, const char *queryString) /* Generate plans for queries. */ plan_list = pg_plan_queries(query_list, 0, NULL); +#ifdef PGXC + /* + * Check if we are dealing with more than one step. + * Multi-step preapred statements are not yet supported. + * PGXCTODO - temporary - Once we add support, this code should be removed. + */ + if (IS_PGXC_COORDINATOR && plan_list && plan_list->head) + { + PlannedStmt *stmt = (PlannedStmt *) lfirst(plan_list->head); + + if (stmt->planTree->lefttree || stmt->planTree->righttree) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PSTATEMENT_DEFINITION), + errmsg("Multi-step Prepared Statements not yet supported"))); + } +#endif /* * Save the results. */ @@ -419,7 +447,76 @@ InitQueryHashTable(void) 32, &hash_ctl, HASH_ELEM); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = sizeof(DatanodeStatement) + NumDataNodes * sizeof(int); + + datanode_queries = hash_create("Datanode Queries", + 64, + &hash_ctl, + HASH_ELEM); + } +#endif +} + +#ifdef PGXC +/* + * Assign the statement name for all the RemoteQueries in the plan tree, so + * they use datanode statements + */ +static int +set_remote_stmtname(Plan *plan, const char *stmt_name, int n) +{ + if (IsA(plan, RemoteQuery)) + { + DatanodeStatement *entry; + bool exists; + + char name[NAMEDATALEN]; + do + { + strcpy(name, stmt_name); + /* + * Append modifier. 
If resulting string is going to be truncated, + * truncate better the base string, otherwise we may enter endless + * loop + */ + if (n) + { + char modifier[NAMEDATALEN]; + sprintf(modifier, "__%d", n); + /* + * if position NAMEDATALEN - strlen(modifier) - 1 is beyond the + * base string this is effectively noop, otherwise it truncates + * the base string + */ + name[NAMEDATALEN - strlen(modifier) - 1] = '\0'; + strcat(name, modifier); + } + n++; + hash_search(datanode_queries, name, HASH_FIND, &exists); + } while (exists); + ((RemoteQuery *) plan)->statement = pstrdup(name); + entry = (DatanodeStatement *) hash_search(datanode_queries, + name, + HASH_ENTER, + NULL); + entry->nodenum = 0; + } + + if (innerPlan(plan)) + n = set_remote_stmtname(innerPlan(plan), stmt_name, n); + + if (outerPlan(plan)) + n = set_remote_stmtname(outerPlan(plan), stmt_name, n); + + return n; } +#endif /* * Store all the data pertaining to a query in the hash table using @@ -459,6 +556,25 @@ StorePreparedStatement(const char *stmt_name, errmsg("prepared statement \"%s\" already exists", stmt_name))); +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + { + ListCell *lc; + int n; + + /* + * Scan the plans and set the statement field for all found RemoteQuery + * nodes so they use data node statements + */ + n = 0; + foreach(lc, stmt_list) + { + PlannedStmt *ps = (PlannedStmt *) lfirst(lc); + n = set_remote_stmtname(ps->planTree, stmt_name, n); + } + } +#endif + /* Create a plancache entry */ plansource = CreateCachedPlan(raw_parse_tree, query_string, @@ -840,3 +956,114 @@ build_regtype_array(Oid *param_types, int num_params) result = construct_array(tmp_ary, num_params, REGTYPEOID, 4, true, 'i'); return PointerGetDatum(result); } + + +#ifdef PGXC +DatanodeStatement * +FetchDatanodeStatement(const char *stmt_name, bool throwError) +{ + DatanodeStatement *entry; + + /* + * If the hash table hasn't been initialized, it can't be storing + * anything, therefore it couldn't possibly store our plan. 
+ */ + if (datanode_queries) + entry = (DatanodeStatement *) hash_search(datanode_queries, + stmt_name, + HASH_FIND, + NULL); + else + entry = NULL; + + /* Report error if entry is not found */ + if (!entry && throwError) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_PSTATEMENT), + errmsg("datanode statement \"%s\" does not exist", + stmt_name))); + + return entry; +} + +/* + * Drop datanode statement and close it on nodes if active + */ +void +DropDatanodeStatement(const char *stmt_name) +{ + DatanodeStatement *entry; + + entry = FetchDatanodeStatement(stmt_name, false); + if (entry) + { + int i; + List *nodelist = NIL; + + /* make a List of integers from node numbers */ + for (i = 0; i < entry->nodenum; i++) + nodelist = lappend_int(nodelist, entry->nodes[i]); + entry->nodenum = 0; + + ExecCloseRemoteStatement(stmt_name, nodelist); + + hash_search(datanode_queries, entry->stmt_name, HASH_REMOVE, NULL); + } +} + + +/* + * Return true if there is at least one active datanode statement, so acquired + * datanode connections should not be released + */ +bool +HaveActiveDatanodeStatements(void) +{ + HASH_SEQ_STATUS seq; + DatanodeStatement *entry; + + /* nothing cached */ + if (!datanode_queries) + return false; + + /* walk over cache */ + hash_seq_init(&seq, datanode_queries); + while ((entry = hash_seq_search(&seq)) != NULL) + { + /* Stop walking and return true */ + if (entry->nodenum > 0) + { + hash_seq_term(&seq); + return true; + } + } + /* nothing found */ + return false; +} + + +/* + * Mark datanode statement as active on specified node + * Return true if statement has already been active on the node and can be used + * Returns falsee if statement has not been active on the node and should be + * prepared on the node + */ +bool +ActivateDatanodeStatementOnNode(const char *stmt_name, int node) +{ + DatanodeStatement *entry; + int i; + + /* find the statement in cache */ + entry = FetchDatanodeStatement(stmt_name, true); + + /* see if statement already active on 
the node */ + for (i = 0; i < entry->nodenum; i++) + if (entry->nodes[i] == node) + return true; + + /* statement is not active on the specified node append item to the list */ + entry->nodes[entry->nodenum++] = node; + return false; +} +#endif diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index 3a65361..ec33781 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -793,6 +793,90 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot) slot->tts_isnull); } +#ifdef PGXC +/* -------------------------------- + * ExecCopySlotDatarow + * Obtain a copy of a slot's data row. The copy is + * palloc'd in the current memory context. + * Pointer to the datarow is returned as a var parameter, function + * returns the length of the data row + * The slot itself is undisturbed + * -------------------------------- + */ +int +ExecCopySlotDatarow(TupleTableSlot *slot, char **datarow) +{ + Assert(datarow); + + if (slot->tts_dataRow) + { + /* if we already have datarow make a copy */ + *datarow = (char *)palloc(slot->tts_dataLen); + memcpy(*datarow, slot->tts_dataRow, slot->tts_dataLen); + return slot->tts_dataLen; + } + else + { + TupleDesc tdesc = slot->tts_tupleDescriptor; + StringInfoData buf; + uint16 n16; + int i; + + initStringInfo(&buf); + /* Number of parameter values */ + n16 = htons(tdesc->natts); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + /* ensure we have all values */ + slot_getallattrs(slot); + for (i = 0; i < tdesc->natts; i++) + { + uint32 n32; + + if (slot->tts_isnull[i]) + { + n32 = htonl(-1); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + else + { + Form_pg_attribute attr = tdesc->attrs[i]; + Oid typOutput; + bool typIsVarlena; + Datum pval; + char *pstring; + int len; + + /* Get info needed to output the value */ + getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena); + /* + * If we have a toasted datum, forcibly detoast it here to avoid + * memory leakage inside the 
type's output routine. + */ + if (typIsVarlena) + pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i])); + else + pval = slot->tts_values[i]; + + /* Convert Datum to string */ + pstring = OidOutputFunctionCall(typOutput, pval); + + /* copy data to the buffer */ + len = strlen(pstring); + n32 = htonl(len); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, pstring, len); + } + } + /* copy data to the buffer */ + *datarow = palloc(buf.len); + memcpy(*datarow, buf.data, buf.len); + pfree(buf.data); + return buf.len; + } +} +#endif + /* -------------------------------- * ExecFetchSlotTuple * Fetch the slot's regular physical tuple. diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index b184987..ad227f4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -839,12 +839,16 @@ _copyRemoteQuery(RemoteQuery *from) COPY_NODE_FIELD(distinct); COPY_SCALAR_FIELD(read_only); COPY_SCALAR_FIELD(force_autocommit); + COPY_STRING_FIELD(statement); COPY_STRING_FIELD(cursor); + COPY_SCALAR_FIELD(exec_type); + COPY_SCALAR_FIELD(paramval_data); + COPY_SCALAR_FIELD(paramval_len); COPY_STRING_FIELD(relname); COPY_SCALAR_FIELD(remotejoin); - COPY_SCALAR_FIELD(reduce_level); - COPY_NODE_FIELD(base_tlist); + COPY_SCALAR_FIELD(reduce_level); + COPY_NODE_FIELD(base_tlist); COPY_STRING_FIELD(outer_alias); COPY_STRING_FIELD(inner_alias); COPY_SCALAR_FIELD(outer_reduce_level); @@ -867,6 +871,9 @@ _copyExecNodes(ExecNodes *from) COPY_NODE_FIELD(nodelist); COPY_SCALAR_FIELD(baselocatortype); COPY_SCALAR_FIELD(tableusagetype); + COPY_NODE_FIELD(expr); + COPY_SCALAR_FIELD(relid); + COPY_SCALAR_FIELD(accesstype); return newnode; } @@ -2305,7 +2312,9 @@ _copyQuery(Query *from) COPY_NODE_FIELD(limitCount); COPY_NODE_FIELD(rowMarks); COPY_NODE_FIELD(setOperations); - +#ifdef PGXC + COPY_STRING_FIELD(sql_statement); +#endif return newnode; } diff --git a/src/backend/pgxc/locator/locator.c 
b/src/backend/pgxc/locator/locator.c index 790b81d..4442310 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -354,22 +354,15 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, case LOCATOR_TYPE_HASH: if (partValue != NULL) - { /* in prototype, all partitioned tables use same map */ exec_nodes->nodelist = lappend_int(NULL, get_node_from_hash(hash_range_int(*partValue))); - } else - { - /* If no info, go to node 1 */ if (accessType == RELATION_ACCESS_INSERT) + /* Insert NULL to node 1 */ exec_nodes->nodelist = lappend_int(NULL, 1); else - /* - * No partitioning value passed in - * (no where qualification on part column - use all) - */ + /* Use all nodes for other types of access */ exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); - } break; case LOCATOR_TYPE_SINGLE: diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 8d900f1..1a56b44 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -58,6 +58,19 @@ typedef struct long constant; /* assume long PGXCTODO - should be Datum */ } Literal_Comparison; +/* + * Comparison of partitioned column and expression + * Expression can be evaluated at execution time to determine target nodes + */ +typedef struct +{ + Oid relid; + RelationLocInfo *rel_loc_info; + Oid attrnum; + char *col_name; + Expr *expr; /* assume long PGXCTODO - should be Datum */ +} Expr_Comparison; + /* Parent-Child joins for relations being joined on * their respective hash distribuion columns */ @@ -75,6 +88,7 @@ typedef struct typedef struct { List *partitioned_literal_comps; /* List of Literal_Comparison */ + List *partitioned_expressions; /* List of Expr_Comparison */ List *partitioned_parent_child; /* List of Parent_Child_Join */ List *replicated_joins; @@ -127,6 +141,7 @@ typedef struct XCWalkerContext Query *query; RelationAccessType accessType; RemoteQuery *query_step; /* remote query step being analized */ + 
PlannerInfo *root; /* planner data for the subquery */ Special_Conditions *conditions; bool multilevel_join; List *rtables; /* a pointer to a list of rtables */ @@ -144,11 +159,12 @@ bool StrictStatementChecking = true; /* Forbid multi-node SELECT statements with an ORDER BY clause */ bool StrictSelectChecking = false; -static void get_plan_nodes(Query *query, RemoteQuery *step, RelationAccessType accessType); +static void get_plan_nodes(PlannerInfo *root, RemoteQuery *step, RelationAccessType accessType); static bool get_plan_nodes_walker(Node *query_node, XCWalkerContext *context); static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context); static int handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stmt); static void InitXCWalkerContext(XCWalkerContext *context); +static RemoteQuery *makeRemoteQuery(void); static void validate_part_col_updatable(const Query *query); static bool is_pgxc_safe_func(Oid funcid); @@ -307,6 +323,7 @@ free_special_relations(Special_Conditions *special_conditions) /* free all items in list, including Literal_Comparison struct */ list_free_deep(special_conditions->partitioned_literal_comps); + list_free_deep(special_conditions->partitioned_expressions); /* free list, but not items pointed to */ list_free(special_conditions->partitioned_parent_child); @@ -451,8 +468,9 @@ get_base_var(Var *var, XCWalkerContext *context) * then the caller should use the regular PG planner */ static void -get_plan_nodes_insert(Query *query, RemoteQuery *step) +get_plan_nodes_insert(PlannerInfo *root, RemoteQuery *step) { + Query *query = root->parse; RangeTblEntry *rte; RelationLocInfo *rel_loc_info; Const *constant; @@ -502,15 +520,15 @@ get_plan_nodes_insert(Query *query, RemoteQuery *step) if (sub_rte->rtekind == RTE_SUBQUERY && !sub_rte->subquery->limitCount && !sub_rte->subquery->limitOffset) - get_plan_nodes(sub_rte->subquery, step, RELATION_ACCESS_READ); + get_plan_nodes(root, step, 
RELATION_ACCESS_READ); } /* Send to general planner if the query is multiple step */ if (!step->exec_nodes) return; - /* If the source is not hash-based (eg, replicated) also send - * through general planner + /* If the source is not hash-based (eg, replicated) also send + * through general planner */ if (step->exec_nodes->baselocatortype != LOCATOR_TYPE_HASH) { @@ -612,7 +630,18 @@ get_plan_nodes_insert(Query *query, RemoteQuery *step) } if (checkexpr == NULL) - return; /* no constant */ + { + /* try and determine nodes on execution time */ + step->exec_nodes = makeNode(ExecNodes); + step->exec_nodes->baselocatortype = rel_loc_info->locatorType; + step->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_USER; + step->exec_nodes->primarynodelist = NULL; + step->exec_nodes->nodelist = NULL; + step->exec_nodes->expr = eval_expr; + step->exec_nodes->relid = rel_loc_info->relid; + step->exec_nodes->accesstype = RELATION_ACCESS_INSERT; + return; + } constant = (Const *) checkexpr; @@ -788,7 +817,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) initStringInfo(&buf); /* Step 1: select tuple values by ctid */ - step1 = makeNode(RemoteQuery); + step1 = makeRemoteQuery(); appendStringInfoString(&buf, "SELECT "); for (att = 1; att <= natts; att++) { @@ -822,13 +851,11 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) appendStringInfo(&buf, " FROM %s WHERE ctid = '%s'", tableName, ctid_str); step1->sql_statement = pstrdup(buf.data); - step1->is_single_step = true; step1->exec_nodes = makeNode(ExecNodes); - step1->read_only = true; step1->exec_nodes->nodelist = list_make1_int(nodenum); /* Step 2: declare cursor for update target table */ - step2 = makeNode(RemoteQuery); + step2 = makeRemoteQuery(); resetStringInfo(&buf); appendStringInfoString(&buf, step->cursor); @@ -852,18 +879,14 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) } appendStringInfoString(&buf, "FOR UPDATE"); step2->sql_statement = 
pstrdup(buf.data); - step2->is_single_step = true; - step2->read_only = true; step2->exec_nodes = makeNode(ExecNodes); step2->exec_nodes->nodelist = list_copy(rel_loc_info1->nodeList); innerPlan(step2) = (Plan *) step1; /* Step 3: move cursor to first position */ - step3 = makeNode(RemoteQuery); + step3 = makeRemoteQuery(); resetStringInfo(&buf); appendStringInfo(&buf, "MOVE %s", node_cursor); step3->sql_statement = pstrdup(buf.data); - step3->is_single_step = true; - step3->read_only = true; step3->exec_nodes = makeNode(ExecNodes); step3->exec_nodes->nodelist = list_copy(rel_loc_info1->nodeList); innerPlan(step3) = (Plan *) step2; @@ -1024,7 +1047,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) if (!IsA(arg2, Const)) { /* this gets freed when the memory context gets freed */ - Expr *eval_expr = (Expr *) eval_const_expressions(NULL, (Node *) arg2); + Expr *eval_expr = (Expr *) eval_const_expressions(context->root, (Node *) arg2); checkexpr = get_numeric_constant(eval_expr); } @@ -1176,6 +1199,32 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) */ return false; } + /* + * Check if it is an expression like pcol = expr, where pcol is + * a partitioning column of the rel1 and planner could not + * evaluate expr. We probably can evaluate it at execution time. 
+ * Save the expression, and if we do not have other hint, + * try and evaluate it at execution time + */ + rel_loc_info1 = GetRelationLocInfo(column_base->relid); + + if (!rel_loc_info1) + return true; + + if (IsHashColumn(rel_loc_info1, column_base->colname)) + { + Expr_Comparison *expr_comp = + palloc(sizeof(Expr_Comparison)); + + expr_comp->relid = column_base->relid; + expr_comp->rel_loc_info = rel_loc_info1; + expr_comp->col_name = column_base->colname; + expr_comp->expr = arg2; + context->conditions->partitioned_expressions = + lappend(context->conditions->partitioned_expressions, + expr_comp); + return false; + } } } } @@ -1599,24 +1648,19 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) } if (rtesave) - { /* a single table, just grab it */ rel_loc_info = GetRelationLocInfo(rtesave->relid); + } - if (!rel_loc_info) - return true; + /* have complex case */ + if (!rel_loc_info) + return true; - context->query_step->exec_nodes = GetRelationNodes(rel_loc_info, - NULL, - context->accessType); - } - } - else - { + if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + /* do not need to determine partitioning expression */ context->query_step->exec_nodes = GetRelationNodes(rel_loc_info, NULL, context->accessType); - } /* Note replicated table usage for determining safe queries */ if (context->query_step->exec_nodes) @@ -1625,6 +1669,38 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) table_usage_type = TABLE_USAGE_TYPE_USER_REPLICATED; context->query_step->exec_nodes->tableusagetype = table_usage_type; + } else if (context->conditions->partitioned_expressions) { + /* probably we can determine nodes on execution time */ + foreach(lc, context->conditions->partitioned_expressions) { + Expr_Comparison *expr_comp = (Expr_Comparison *) lfirst(lc); + if (rel_loc_info->relid == expr_comp->relid) + { + context->query_step->exec_nodes = makeNode(ExecNodes); + context->query_step->exec_nodes->baselocatortype = + 
rel_loc_info->locatorType; + context->query_step->exec_nodes->tableusagetype = + TABLE_USAGE_TYPE_USER; + context->query_step->exec_nodes->primarynodelist = NULL; + context->query_step->exec_nodes->nodelist = NULL; + context->query_step->exec_nodes->expr = expr_comp->expr; + context->query_step->exec_nodes->relid = expr_comp->relid; + context->query_step->exec_nodes->accesstype = context->accessType; + break; + } + } + } else { + /* run query on all nodes */ + context->query_step->exec_nodes = makeNode(ExecNodes); + context->query_step->exec_nodes->baselocatortype = + rel_loc_info->locatorType; + context->query_step->exec_nodes->tableusagetype = + TABLE_USAGE_TYPE_USER; + context->query_step->exec_nodes->primarynodelist = NULL; + context->query_step->exec_nodes->nodelist = + list_copy(rel_loc_info->nodeList); + context->query_step->exec_nodes->expr = NULL; + context->query_step->exec_nodes->relid = NULL; + context->query_step->exec_nodes->accesstype = context->accessType; } } /* check for partitioned col comparison against a literal */ @@ -1712,6 +1788,7 @@ InitXCWalkerContext(XCWalkerContext *context) context->query = NULL; context->accessType = RELATION_ACCESS_READ; context->query_step = NULL; + context->root = NULL; context->conditions = (Special_Conditions *) palloc0(sizeof(Special_Conditions)); context->rtables = NIL; context->multilevel_join = false; @@ -1722,20 +1799,57 @@ InitXCWalkerContext(XCWalkerContext *context) context->join_list = NIL; } + +/* + * Create an instance of RemoteQuery and initialize fields + */ +static RemoteQuery * +makeRemoteQuery(void) +{ + RemoteQuery *result = makeNode(RemoteQuery); + result->is_single_step = true; + result->sql_statement = NULL; + result->exec_nodes = NULL; + result->combine_type = COMBINE_TYPE_NONE; + result->simple_aggregates = NIL; + result->sort = NULL; + result->distinct = NULL; + result->read_only = true; + result->force_autocommit = false; + result->cursor = NULL; + result->exec_type = EXEC_ON_DATANODES; + 
result->paramval_data = NULL; + result->paramval_len = 0; + + result->relname = NULL; + result->remotejoin = false; + result->partitioned_replicated = false; + result->reduce_level = 0; + result->base_tlist = NIL; + result->outer_alias = NULL; + result->inner_alias = NULL; + result->outer_reduce_level = 0; + result->inner_reduce_level = 0; + result->outer_relids = NULL; + result->inner_relids = NULL; + return result; +} + /* * Top level entry point before walking query to determine plan nodes * */ static void -get_plan_nodes(Query *query, RemoteQuery *step, RelationAccessType accessType) +get_plan_nodes(PlannerInfo *root, RemoteQuery *step, RelationAccessType accessType) { + Query *query = root->parse; XCWalkerContext context; - InitXCWalkerContext(&context); context.query = query; context.accessType = accessType; context.query_step = step; + context.root = root; context.rtables = lappend(context.rtables, query->rtable); if ((get_plan_nodes_walker((Node *) query, &context) @@ -1754,24 +1868,24 @@ get_plan_nodes(Query *query, RemoteQuery *step, RelationAccessType accessType) * */ static void -get_plan_nodes_command(Query *query, RemoteQuery *step) +get_plan_nodes_command(RemoteQuery *step, PlannerInfo *root) { - switch (query->commandType) + switch (root->parse->commandType) { case CMD_SELECT: - get_plan_nodes(query, step, query->rowMarks ? + get_plan_nodes(root, step, root->parse->rowMarks ? 
RELATION_ACCESS_READ_FOR_UPDATE : RELATION_ACCESS_READ); break; case CMD_INSERT: - get_plan_nodes_insert(query, step); + get_plan_nodes_insert(root, step); break; case CMD_UPDATE: case CMD_DELETE: /* treat as a select */ - get_plan_nodes(query, step, RELATION_ACCESS_UPDATE); + get_plan_nodes(root, step, RELATION_ACCESS_UPDATE); break; default: @@ -2589,9 +2703,41 @@ PlannedStmt * pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) { PlannedStmt *result; - Plan *standardPlan; + PlannerGlobal *glob; + PlannerInfo *root; RemoteQuery *query_step; + StringInfoData buf; + /* + * Set up global state for this planner invocation. This data is needed + * across all levels of sub-Query that might exist in the given command, + * so we keep it in a separate struct that's linked to by each per-Query + * PlannerInfo. + */ + glob = makeNode(PlannerGlobal); + + glob->boundParams = boundParams; + glob->paramlist = NIL; + glob->subplans = NIL; + glob->subrtables = NIL; + glob->rewindPlanIDs = NULL; + glob->finalrtable = NIL; + glob->relationOids = NIL; + glob->invalItems = NIL; + glob->lastPHId = 0; + glob->transientPlan = false; + + /* Create a PlannerInfo data structure, usually it is done for a subquery */ + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->parent_root = NULL; + root->planner_cxt = CurrentMemoryContext; + root->init_plans = NIL; + root->cte_plan_ids = NIL; + root->eq_classes = NIL; + root->append_rel_list = NIL; /* build the PlannedStmt result */ result = makeNode(PlannedStmt); @@ -2603,184 +2749,151 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->intoClause = query->intoClause; result->rtable = query->rtable; - query_step = makeNode(RemoteQuery); - query_step->is_single_step = false; + query_step = makeRemoteQuery(); + + /* Optimize multi-node handling */ + query_step->read_only = query->commandType == CMD_SELECT; if (query->utilityStmt && 
IsA(query->utilityStmt, DeclareCursorStmt)) cursorOptions |= ((DeclareCursorStmt *) query->utilityStmt)->options; - query_step->exec_nodes = NULL; - query_step->combine_type = COMBINE_TYPE_NONE; - query_step->simple_aggregates = NULL; - /* Optimize multi-node handling */ - query_step->read_only = query->commandType == CMD_SELECT; - query_step->force_autocommit = false; - result->planTree = (Plan *) query_step; - /* - * Determine where to execute the command, either at the Coordinator - * level, Data Nodes, or both. By default we choose both. We should be - * able to quickly expand this for more commands. - */ - switch (query->commandType) - { - case CMD_SELECT: - /* Perform some checks to make sure we can support the statement */ - if (query->intoClause) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("INTO clause not yet supported")))); - /* fallthru */ - case CMD_INSERT: - case CMD_UPDATE: - case CMD_DELETE: - /* PGXCTODO: This validation will not be removed - * until we support moving tuples from one node to another - * when the partition column of a table is updated - */ - if (query->commandType == CMD_UPDATE) - validate_part_col_updatable(query); - - if (query->returningList) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("RETURNING clause not yet supported")))); - - /* Set result relations */ - if (query->commandType != CMD_SELECT) - result->resultRelations = list_make1_int(query->resultRelation); + /* Perform some checks to make sure we can support the statement */ + if (query->commandType == CMD_SELECT && query->intoClause) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("INTO clause not yet supported")))); - get_plan_nodes_command(query, query_step); + /* PGXCTODO: This validation will not be removed + * until we support moving tuples from one node to another + * when the partition column of a table is updated + */ + if (query->commandType == CMD_UPDATE) + 
validate_part_col_updatable(query); - if (query_step->exec_nodes == NULL) - { - /* Do not yet allow multi-node correlated UPDATE or DELETE */ - if (query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) - { - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("UPDATE and DELETE that are correlated or use non-immutable functions not yet supported")))); - } + if (query->returningList) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("RETURNING clause not yet supported")))); - /* - * Processing guery against catalog tables, or multi-step command. - * Run through standard planner - */ - result = standard_planner(query, cursorOptions, boundParams); - return result; - } + /* Set result relations */ + if (query->commandType != CMD_SELECT) + result->resultRelations = list_make1_int(query->resultRelation); - /* Do not yet allow multi-node correlated UPDATE or DELETE */ - if ((query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) - && !query_step->exec_nodes - && list_length(query->rtable) > 1) - { - result = standard_planner(query, cursorOptions, boundParams); - return result; - } + get_plan_nodes_command(query_step, root); - /* - * get_plan_nodes_command may alter original statement, so do not - * process it before the call - * - * Declare Cursor case: - * We should leave as a step query only SELECT statement - * Further if we need refer source statement for planning we should take - * the truncated string - */ - if (query->utilityStmt && - IsA(query->utilityStmt, DeclareCursorStmt)) - { + if (query_step->exec_nodes == NULL) + { + /* Do not yet allow multi-node correlated UPDATE or DELETE */ + if (query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) + { + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("UPDATE and DELETE that are correlated or use non-immutable functions not yet supported")))); + } - /* search for SELECT keyword in the normalized string */ - char 
*select = strpos(query->sql_statement, " SELECT "); - /* Take substring of the original string using found offset */ - query_step->sql_statement = pstrdup(select + 1); - } - else - query_step->sql_statement = pstrdup(query->sql_statement); + /* + * Processing guery against catalog tables, or multi-step command. + * Run through standard planner + */ + result = standard_planner(query, cursorOptions, boundParams); + return result; + } - /* - * If there already is an active portal, we may be doing planning - * within a function. Just use the standard plan, but check if - * it is part of an EXPLAIN statement so that we do not show that - * we plan multiple steps when it is a single-step operation. - */ - if (ActivePortal && strcmp(ActivePortal->commandTag, "EXPLAIN")) - return standard_planner(query, cursorOptions, boundParams); + /* Do not yet allow multi-node correlated UPDATE or DELETE */ + if ((query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) + && !query_step->exec_nodes + && list_length(query->rtable) > 1) + { + result = standard_planner(query, cursorOptions, boundParams); + return result; + } - query_step->is_single_step = true; - /* - * PGXCTODO - * When Postgres runs insert into t (a) values (1); against table - * defined as create table t (a int, b int); the plan is looking - * like insert into t (a,b) values (1,null); - * Later executor is verifying plan, to make sure table has not - * been altered since plan has been created and comparing table - * definition with plan target list and output error if they do - * not match. - * I could not find better way to generate targetList for pgxc plan - * then call standard planner and take targetList from the plan - * generated by Postgres. - */ - query_step->scan.plan.targetlist = query->targetList; + /* + * Deparse query tree to get step query. 
It may be modified later on + */ + initStringInfo(&buf); + deparse_query(query, &buf, NIL); + query_step->sql_statement = pstrdup(buf.data); + pfree(buf.data); - if (query_step->exec_nodes) - query_step->combine_type = get_plan_combine_type( - query, query_step->exec_nodes->baselocatortype); + query_step->is_single_step = true; + /* + * PGXCTODO + * When Postgres runs insert into t (a) values (1); against table + * defined as create table t (a int, b int); the plan is looking + * like insert into t (a,b) values (1,null); + * Later executor is verifying plan, to make sure table has not + * been altered since plan has been created and comparing table + * definition with plan target list and output error if they do + * not match. + * I could not find better way to generate targetList for pgxc plan + * then call standard planner and take targetList from the plan + * generated by Postgres. + */ + query_step->scan.plan.targetlist = query->targetList; - /* Set up simple aggregates */ - /* PGXCTODO - we should detect what types of aggregates are used. - * in some cases we can avoid the final step and merely proxy results - * (when there is only one data node involved) instead of using - * coordinator consolidation. At the moment this is needed for AVG() - */ - query_step->simple_aggregates = get_simple_aggregates(query); + if (query_step->exec_nodes) + query_step->combine_type = get_plan_combine_type( + query, query_step->exec_nodes->baselocatortype); + + /* Set up simple aggregates */ + /* PGXCTODO - we should detect what types of aggregates are used. + * in some cases we can avoid the final step and merely proxy results + * (when there is only one data node involved) instead of using + * coordinator consolidation. 
At the moment this is needed for AVG() + */ + query_step->simple_aggregates = get_simple_aggregates(query); - /* - * Add sorting to the step - */ - if (list_length(query_step->exec_nodes->nodelist) > 1 && - (query->sortClause || query->distinctClause)) - make_simple_sort_from_sortclauses(query, query_step); + /* + * Add sorting to the step + */ + if (list_length(query_step->exec_nodes->nodelist) > 1 && + (query->sortClause || query->distinctClause)) + make_simple_sort_from_sortclauses(query, query_step); - /* Handle LIMIT and OFFSET for single-step queries on multiple nodes */ - if (handle_limit_offset(query_step, query, result)) - { - /* complicated expressions, just fallback to standard plan */ - result = standard_planner(query, cursorOptions, boundParams); - return result; - } + /* Handle LIMIT and OFFSET for single-step queries on multiple nodes */ + if (handle_limit_offset(query_step, query, result)) + { + /* complicated expressions, just fallback to standard plan */ + result = standard_planner(query, cursorOptions, boundParams); + return result; + } - /* - * Use standard plan if we have more than one data node with either - * group by, hasWindowFuncs, or hasRecursive - */ - /* - * PGXCTODO - this could be improved to check if the first - * group by expression is the partitioning column, in which - * case it is ok to treat as a single step. - */ - if (query->commandType == CMD_SELECT - && query_step->exec_nodes - && list_length(query_step->exec_nodes->nodelist) > 1 - && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) - { - result->planTree = standardPlan; - return result; - } - break; + /* + * Use standard plan if we have more than one data node with either + * group by, hasWindowFuncs, or hasRecursive + */ + /* + * PGXCTODO - this could be improved to check if the first + * group by expression is the partitioning column, in which + * case it is ok to treat as a single step. 
+ */ + if (query->commandType == CMD_SELECT + && query_step->exec_nodes + && list_length(query_step->exec_nodes->nodelist) > 1 + && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) + { + result = standard_planner(query, cursorOptions, boundParams); + return result; + } - default: - /* Allow for override */ - if (StrictStatementChecking) - ereport(ERROR, - (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("This command is not yet supported.")))); - else - result->planTree = standardPlan; + /* Allow for override */ + /* AM: Is this ever possible? */ + if (query->commandType != CMD_SELECT && + query->commandType != CMD_INSERT && + query->commandType != CMD_UPDATE && + query->commandType != CMD_DELETE) + { + if (StrictStatementChecking) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("This command is not yet supported.")))); + else + result = standard_planner(query, cursorOptions, boundParams); + return result; } /* @@ -2808,6 +2921,13 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) } /* + * Assume single step. If there are multiple steps we should make up + * parameters for each step where they referenced + */ + if (boundParams) + query_step->paramval_len = ParamListToDataRow(boundParams, + &query_step->paramval_data); + /* * If query is FOR UPDATE fetch CTIDs from the remote node * Use CTID as a key to update tuples on remote nodes when handling * WHERE CURRENT OF @@ -3068,7 +3188,7 @@ validate_part_col_updatable(const Query *query) * * Based on is_immutable_func from postgresql_fdw.c * We add an exeption for base postgresql functions, to - * allow now() and others to still execute as part of single step + * allow now() and others to still execute as part of single step * queries. 
* * PGXCTODO - we currently make the false assumption that immutable diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index b954003..a387354 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -19,6 +19,7 @@ #include "postgres.h" #include "access/gtm.h" #include "access/xact.h" +#include "commands/prepare.h" #include "executor/executor.h" #include "gtm/gtm_c.h" #include "libpq/libpq.h" @@ -27,6 +28,7 @@ #include "pgxc/poolmgr.h" #include "storage/ipc.h" #include "utils/datum.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/tuplesort.h" #include "utils/snapmgr.h" @@ -38,9 +40,6 @@ #define DATA_NODE_FETCH_SIZE 1 - -extern char *deparseSql(RemoteQueryState *scanstate); - /* * Buffer size does not affect performance significantly, just do not allow * connection buffer grows infinitely @@ -62,6 +61,9 @@ static int pgxc_node_rollback_prepared(GlobalTransactionId gxid, GlobalTransacti PGXCNodeAllHandles * pgxc_handles, char *gid); static int pgxc_node_commit_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, PGXCNodeAllHandles * pgxc_handles, char *gid); +static PGXCNodeAllHandles * get_exec_connections(RemoteQueryState *planstate, + ExecNodes *exec_nodes, + RemoteQueryExecType exec_type); static int pgxc_node_implicit_commit_prepared(GlobalTransactionId prepare_xid, GlobalTransactionId commit_xid, PGXCNodeAllHandles * pgxc_handles, @@ -70,8 +72,6 @@ static int pgxc_node_implicit_commit_prepared(GlobalTransactionId prepare_xid, static int pgxc_node_implicit_prepare(GlobalTransactionId prepare_xid, PGXCNodeAllHandles * pgxc_handles, char *gid); -static PGXCNodeAllHandles * get_exec_connections(ExecNodes *exec_nodes, - RemoteQueryExecType exec_type); static int pgxc_node_receive_and_validate(const int conn_count, PGXCNodeHandle ** connections, bool reset_combiner); @@ -1265,10 +1265,10 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState 
*combiner) /* * If we are in the process of shutting down, we - * may be rolling back, and the buffer may contain other messages. - * We want to avoid a procarray exception - * as well as an error stack overflow. - */ + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ if (proc_exit_inprogress) conn->state = DN_CONNECTION_STATE_ERROR_FATAL; @@ -1364,10 +1364,11 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) /* sync lost? */ elog(WARNING, "Received unsupported message type: %c", msg_type); conn->state = DN_CONNECTION_STATE_ERROR_FATAL; - return RESPONSE_EOF; + /* stop reading */ + return RESPONSE_COMPLETE; } } - + /* never happen, but keep compiler quiet */ return RESPONSE_EOF; } @@ -2746,7 +2747,6 @@ RemoteQueryState * ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) { RemoteQueryState *remotestate; - Relation currentRelation; remotestate = CreateResponseCombiner(0, node->combine_type); remotestate->ss.ps.plan = (Plan *) node; @@ -2788,6 +2788,19 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) ALLOCSET_DEFAULT_MAXSIZE); } + /* + * If we have parameter values here and planner has not had them we + * should prepare them now + */ + if (estate->es_param_list_info && !node->paramval_data) + node->paramval_len = ParamListToDataRow(estate->es_param_list_info, + &node->paramval_data); + + /* We need expression context to evaluate */ + if (node->exec_nodes && node->exec_nodes->expr) + ExecAssignExprContext(estate, &remotestate->ss.ps); + + if (innerPlan(node)) innerPlanState(remotestate) = ExecInitNode(innerPlan(node), estate, eflags); @@ -2853,7 +2866,8 @@ copy_slot(RemoteQueryState *node, TupleTableSlot *src, TupleTableSlot *dst) * Datanodes Only, Coordinators only or both types */ static PGXCNodeAllHandles * -get_exec_connections(ExecNodes *exec_nodes, +get_exec_connections(RemoteQueryState *planstate, + 
ExecNodes *exec_nodes, RemoteQueryExecType exec_type) { List *nodelist = NIL; @@ -2873,8 +2887,34 @@ get_exec_connections(ExecNodes *exec_nodes, if (exec_nodes) { - nodelist = exec_nodes->nodelist; - primarynode = exec_nodes->primarynodelist; + if (exec_nodes->expr) + { + /* execution time determining of target data nodes */ + bool isnull; + ExprState *estate = ExecInitExpr(exec_nodes->expr, + (PlanState *) planstate); + Datum partvalue = ExecEvalExpr(estate, + planstate->ss.ps.ps_ExprContext, + &isnull, + NULL); + if (!isnull) + { + RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->relid); + ExecNodes *nodes = GetRelationNodes(rel_loc_info, + (long *) &partvalue, + exec_nodes->accesstype); + if (nodes) + { + nodelist = nodes->nodelist; + primarynode = nodes->primarynodelist; + pfree(nodes); + } + FreeRelationLocInfo(rel_loc_info); + } + } else { + nodelist = exec_nodes->nodelist; + primarynode = exec_nodes->primarynodelist; + } } if (list_length(nodelist) == 0 && @@ -2961,212 +3001,273 @@ register_write_nodes(int conn_count, PGXCNodeHandle **connections) } } -/* - * Execute step of PGXC plan. - * The step specifies a command to be executed on specified nodes. - * On first invocation connections to the data nodes are initialized and - * command is executed. Further, as well as within subsequent invocations, - * responses are received until step is completed or there is a tuple to emit. - * If there is a tuple it is returned, otherwise returned NULL. The NULL result - * from the function indicates completed step. - * The function returns at most one tuple per invocation. 
- */ -TupleTableSlot * -ExecRemoteQuery(RemoteQueryState *node) + +static void +do_query(RemoteQueryState *node) { RemoteQuery *step = (RemoteQuery *) node->ss.ps.plan; - TupleTableSlot *resultslot = node->ss.ps.ps_ResultTupleSlot; TupleTableSlot *scanslot = node->ss.ss_ScanTupleSlot; - bool have_tuple = false; + bool force_autocommit = step->force_autocommit; + bool is_read_only = step->read_only; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + Snapshot snapshot = GetActiveSnapshot(); + TimestampTz timestamp = GetCurrentGTMStartTimestamp(); + PGXCNodeHandle **connections = NULL; + PGXCNodeHandle *primaryconnection = NULL; + int i; + int regular_conn_count; + int total_conn_count; + bool need_tran; + PGXCNodeAllHandles *pgxc_connections; + /* + * Get connections for Datanodes only, utilities and DDLs + * are launched in ExecRemoteUtility + */ + pgxc_connections = get_exec_connections(node, step->exec_nodes, + EXEC_ON_DATANODES); - if (!node->query_Done) + connections = pgxc_connections->datanode_handles; + primaryconnection = pgxc_connections->primary_handle; + total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; + + /* + * Primary connection is counted separately but is included in total_conn_count if used. + */ + if (primaryconnection) { - /* First invocation, initialize */ - bool force_autocommit = step->force_autocommit; - bool is_read_only = step->read_only; - GlobalTransactionId gxid = InvalidGlobalTransactionId; - Snapshot snapshot = GetActiveSnapshot(); - TimestampTz timestamp = GetCurrentGTMStartTimestamp(); - PGXCNodeHandle **connections = NULL; - PGXCNodeHandle *primaryconnection = NULL; - int i; - int regular_conn_count; - int total_conn_count; - bool need_tran; - PGXCNodeAllHandles *pgxc_connections; - TupleTableSlot *innerSlot = NULL; - - implicit_force_autocommit = force_autocommit; + regular_conn_count--; + } - /* - * Inner plan for RemoteQuery supplies parameters. 
- * We execute inner plan to get a tuple and use values of the tuple as - * parameter values when executing this remote query. - * If returned slot contains NULL tuple break execution. - * TODO there is a problem how to handle the case if both inner and - * outer plans exist. We can decide later, since it is never used now. - */ - if (innerPlanState(node)) - { - innerSlot = ExecProcNode(innerPlanState(node)); -// if (TupIsNull(innerSlot)) -// return innerSlot; - } + pfree(pgxc_connections); - /* - * Get connections for Datanodes only, utilities and DDLs - * are launched in ExecRemoteUtility - */ - pgxc_connections = get_exec_connections(step->exec_nodes, - EXEC_ON_DATANODES); + /* + * We save only regular connections, at the time we exit the function + * we finish with the primary connection and deal only with regular + * connections on subsequent invocations + */ + node->node_count = regular_conn_count; - connections = pgxc_connections->datanode_handles; - primaryconnection = pgxc_connections->primary_handle; - total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; + if (force_autocommit) + need_tran = false; + else + need_tran = !autocommit || (!is_read_only && total_conn_count > 1); - /* - * Primary connection is counted separately but is included in total_conn_count if used. - */ + elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, need_tran = %s", autocommit ? "true" : "false", primaryconnection ? "true" : "false", regular_conn_count, need_tran ? 
"true" : "false"); + + stat_statement(); + if (autocommit) + { + stat_transaction(total_conn_count); + /* We normally clear for transactions, but if autocommit, clear here, too */ + clear_write_node_list(); + } + + if (!is_read_only) + { if (primaryconnection) - { - regular_conn_count--; - } + register_write_nodes(1, &primaryconnection); + register_write_nodes(regular_conn_count, connections); + } + + gxid = GetCurrentGlobalTransactionId(); - pfree(pgxc_connections); + if (!GlobalTransactionIdIsValid(gxid)) + { + if (primaryconnection) + pfree(primaryconnection); + pfree(connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + if (need_tran) + { /* - * We save only regular connections, at the time we exit the function - * we finish with the primary connection and deal only with regular - * connections on subsequent invocations + * Check if data node connections are in transaction and start + * transactions on nodes where it is not started */ - node->node_count = regular_conn_count; + PGXCNodeHandle *new_connections[total_conn_count]; + int new_count = 0; - if (force_autocommit) - need_tran = false; - else - need_tran = !autocommit || (!is_read_only && total_conn_count > 1); + if (primaryconnection && primaryconnection->transaction_status != 'T') + new_connections[new_count++] = primaryconnection; + for (i = 0; i < regular_conn_count; i++) + if (connections[i]->transaction_status != 'T') + new_connections[new_count++] = connections[i]; + + if (new_count && pgxc_node_begin(new_count, new_connections, gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data nodes."))); + } - elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, need_tran = %s", autocommit ? "true" : "false", primaryconnection ? "true" : "false", regular_conn_count, need_tran ? 
"true" : "false"); + /* See if we have a primary node, execute on it first before the others */ + if (primaryconnection) + { + if (primaryconnection->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(primaryconnection); - stat_statement(); - if (autocommit) + /* If explicit transaction is needed gxid is already sent */ + if (!need_tran && pgxc_node_send_gxid(primaryconnection, gxid)) { - stat_transaction(total_conn_count); - /* We normally clear for transactions, but if autocommit, clear here, too */ - clear_write_node_list(); + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); } - - if (!is_read_only) + if (total_conn_count == 1 && pgxc_node_send_timestamp(primaryconnection, timestamp)) { - if (primaryconnection) - register_write_nodes(1, &primaryconnection); - register_write_nodes(regular_conn_count, connections); + /* + * If a transaction involves multiple connections timestamp is + * always sent down to Datanodes with pgxc_node_begin. + * An autocommit transaction needs the global timestamp also, + * so handle this case here. 
+ */ + pfree(connections); + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); } - - gxid = GetCurrentGlobalTransactionId(); - - if (!GlobalTransactionIdIsValid(gxid)) + if (snapshot && pgxc_node_send_snapshot(primaryconnection, snapshot)) { - if (primaryconnection) - pfree(primaryconnection); pfree(connections); + pfree(primaryconnection); ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to get next transaction ID"))); + errmsg("Failed to send command to data nodes"))); } - - if (need_tran) + if (step->statement || step->cursor || step->paramval_data) { + /* need to use Extended Query Protocol */ + int fetch = 0; + bool prepared = false; + + /* if prepared statement is referenced see if it is already exist */ + if (step->statement) + prepared = ActivateDatanodeStatementOnNode(step->statement, + primaryconnection->nodenum); /* - * Check if data node connections are in transaction and start - * transactions on nodes where it is not started + * execute and fetch rows only if they will be consumed + * immediately by the sorter */ - PGXCNodeHandle *new_connections[total_conn_count]; - int new_count = 0; - - if (primaryconnection && primaryconnection->transaction_status != 'T') - new_connections[new_count++] = primaryconnection; - for (i = 0; i < regular_conn_count; i++) - if (connections[i]->transaction_status != 'T') - new_connections[new_count++] = connections[i]; - - if (new_count && pgxc_node_begin(new_count, new_connections, gxid)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data nodes."))); - } - - /* See if we have a primary node, execute on it first before the others */ - if (primaryconnection) - { - if (primaryconnection->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(primaryconnection); - - /* If explicit transaction is needed gxid is already sent */ - if (!need_tran && 
pgxc_node_send_gxid(primaryconnection, gxid)) + if (step->cursor) + fetch = 1; + + if (pgxc_node_send_query_extended(primaryconnection, + prepared ? NULL : step->sql_statement, + step->statement, + step->cursor, + step->paramval_len, + step->paramval_data, + step->read_only, + fetch) != 0) { pfree(connections); - pfree(primaryconnection); + if (primaryconnection) + pfree(primaryconnection); ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to data nodes"))); } - if (total_conn_count == 1 && pgxc_node_send_timestamp(primaryconnection, timestamp)) + } + else + { + if (pgxc_node_send_query(primaryconnection, step->sql_statement) != 0) { - /* - * If a transaction involves multiple connections timestamp is - * always sent down to Datanodes with pgxc_node_begin. - * An autocommit transaction needs the global timestamp also, - * so handle this case here. - */ pfree(connections); pfree(primaryconnection); ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to data nodes"))); } - if (snapshot && pgxc_node_send_snapshot(primaryconnection, snapshot)) - { - pfree(connections); - pfree(primaryconnection); + } + primaryconnection->combiner = node; + Assert(node->combine_type == COMBINE_TYPE_SAME); + + while (node->command_complete_count < 1) + { + if (pgxc_node_receive(1, &primaryconnection, NULL)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - if (pgxc_node_send_query(primaryconnection, step->sql_statement) != 0) + errmsg("Failed to read response from data nodes"))); + handle_response(primaryconnection, node); + if (node->errorMessage) { - pfree(connections); - pfree(primaryconnection); + char *code = node->errorCode; ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - Assert(node->combine_type == COMBINE_TYPE_SAME); - - while (node->command_complete_count < 1) - { - if (pgxc_node_receive(1, 
&primaryconnection, NULL)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to read response from data nodes"))); - handle_response(primaryconnection, node); - if (node->errorMessage) - { - char *code = node->errorCode; - ereport(ERROR, - (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), - errmsg("%s", node->errorMessage))); - } + (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])), + errmsg("%s", node->errorMessage))); } } + } - for (i = 0; i < regular_conn_count; i++) + for (i = 0; i < regular_conn_count; i++) + { + if (connections[i]->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(connections[i]); + /* If explicit transaction is needed gxid is already sent */ + if (!need_tran && pgxc_node_send_gxid(connections[i], gxid)) { - if (connections[i]->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(connections[i]); - /* If explicit transaction is needed gxid is already sent */ - if (!need_tran && pgxc_node_send_gxid(connections[i], gxid)) + pfree(connections); + if (primaryconnection) + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (total_conn_count == 1 && pgxc_node_send_timestamp(connections[i], timestamp)) + { + /* + * If a transaction involves multiple connections timestamp is + * always sent down to Datanodes with pgxc_node_begin. + * An autocommit transaction needs the global timestamp also, + * so handle this case here. 
+ */ + pfree(connections); + if (primaryconnection) + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot)) + { + pfree(connections); + if (primaryconnection) + pfree(primaryconnection); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + if (step->statement || step->cursor || step->paramval_data) + { + /* need to use Extended Query Protocol */ + int fetch = 0; + bool prepared = false; + + /* if prepared statement is referenced see if it is already exist */ + if (step->statement) + prepared = ActivateDatanodeStatementOnNode(step->statement, + connections[i]->nodenum); + /* + * execute and fetch rows only if they will be consumed + * immediately by the sorter + */ + if (step->cursor) + fetch = 1; + + if (pgxc_node_send_query_extended(connections[i], + prepared ? NULL : step->sql_statement, + step->statement, + step->cursor, + step->paramval_len, + step->paramval_data, + step->read_only, + fetch) != 0) { pfree(connections); if (primaryconnection) @@ -3175,14 +3276,11 @@ ExecRemoteQuery(RemoteQueryState *node) (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to data nodes"))); } - if (total_conn_count == 1 && pgxc_node_send_timestamp(connections[i], timestamp)) + } + else + { + if (pgxc_node_send_query(connections[i], step->sql_statement) != 0) { - /* - * If a transaction involves multiple connections timestamp is - * always sent down to Datanodes with pgxc_node_begin. - * An autocommit transaction needs the global timestamp also, - * so handle this case here. - */ pfree(connections); if (primaryconnection) pfree(primary... [truncated message content] |
From: Michael P. <mic...@us...> - 2010-12-22 05:58:53
|
Project "Postgres-XC". The branch, master has been updated via 0ab9bbc7600c157618d566f4d9985399e446519d (commit) from bb22b7d667c20228e23526c5627197c10ae54672 (commit) - Log ----------------------------------------------------------------- commit 0ab9bbc7600c157618d566f4d9985399e446519d Author: Michael P <mic...@us...> Date: Wed Dec 22 14:48:48 2010 +0900 Correction for implicit 2PC When a COMMIT is issued for a write transaction involving multiple Postgres-XC nodes, a 2PC is used internally. For implicit 2PC, the following process is respected for DDL transactions: 1) PREPARE on local Coordinator (if DDL is involved) 2) PREPARE on Postgres-XC nodes 3) COMMIT PREPARED on local Coordinator (if DDL is involved) 4) COMMIT PREPARED on Postgres-XC nodes For transaction containing no DDL: 1) PREPARE on Datanodes 2) COMMIT on Coordinator 3) COMMIT PREPARED on Datanodes In case of a Node failure after Coordinator has committed, transaction becomes partially committed on Nodes. To maintain data consistency, it is absolutely necessary to COMMIT this transaction on all nodes. In this case, the remaining list PREPARED nodes is saved on GTM as if it was an explicit 2PC. And this transaction is kept open to avoid visibility issues. It is necessary to issue a COMMIT PREPARED from application to finish the COMMIT of this transaction. 
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 7465847..da9e3b1 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -26,7 +26,7 @@ #include "access/gtm.h" /* PGXC_COORD */ #include "gtm/gtm_c.h" -#include "pgxc/pgxcnode.h" +#include "pgxc/execRemote.h" /* PGXC_DATANODE */ #include "postmaster/autovacuum.h" #endif @@ -139,6 +139,9 @@ typedef struct TransactionStateData TransactionId transactionId; /* my XID, or Invalid if none */ #ifdef PGXC /* PGXC_COORD */ GlobalTransactionId globalTransactionId; /* my GXID, or Invalid if none */ + GlobalTransactionId globalCommitTransactionId; /* Commit GXID used by implicit 2PC */ + bool ArePGXCNodesPrepared; /* Checks if PGXC Nodes are prepared and + * rollbacks then in case of an Abort */ #endif SubTransactionId subTransactionId; /* my subxact ID */ char *name; /* savepoint name, if any */ @@ -169,6 +172,8 @@ static TransactionStateData TopTransactionStateData = { 0, /* transaction id */ #ifdef PGXC 0, /* global transaction id */ + 0, /* global commit transaction id */ + 0, /* flag if nodes are prepared or not */ #endif 0, /* subtransaction id */ NULL, /* savepoint name */ @@ -307,6 +312,7 @@ static const char *TransStateAsString(TransState state); #ifdef PGXC /* PGXC_COORD */ static GlobalTransactionId GetGlobalTransactionId(TransactionState s); +static void PrepareTransaction(bool write_2pc_file, bool is_implicit); /* ---------------------------------------------------------------- * PG-XC Functions @@ -1631,10 +1637,15 @@ StartTransaction(void) * start processing */ s->state = TRANS_START; -#ifdef PGXC /* PGXC_COORD */ +#ifdef PGXC /* GXID is assigned already by a remote Coordinator */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { s->globalTransactionId = InvalidGlobalTransactionId; /* until assigned */ + /* Until assigned by implicit 2PC */ + s->globalCommitTransactionId = InvalidGlobalTransactionId; + s->ArePGXCNodesPrepared = false; + } 
#endif s->transactionId = InvalidTransactionId; /* until assigned */ /* @@ -1737,7 +1748,31 @@ CommitTransaction(void) { TransactionState s = CurrentTransactionState; TransactionId latestXid; +#ifdef PGXC + bool PrepareLocalCoord = false; + bool PreparePGXCNodes = false; + char implicitgid[256]; + TransactionId xid = GetCurrentTransactionId(); + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + PreparePGXCNodes = PGXCNodeIsImplicit2PC(&PrepareLocalCoord); + + if (PrepareLocalCoord || PreparePGXCNodes) + sprintf(implicitgid, "T%d", xid); + + /* Save GID where PrepareTransaction can find it again */ + if (PrepareLocalCoord) + { + prepareGID = MemoryContextStrdup(TopTransactionContext, implicitgid); + /* + * If current transaction has a DDL, and involves more than 1 Coordinator, + * PREPARE first on local Coordinator. + */ + PrepareTransaction(true, true); + } + else + { +#endif ShowTransactionState("CommitTransaction"); /* @@ -1747,6 +1782,28 @@ CommitTransaction(void) elog(WARNING, "CommitTransaction while in %s state", TransStateAsString(s->state)); Assert(s->parent == NULL); +#ifdef PGXC + } + + /* + * If Transaction has involved several nodes, prepare them before committing on Coordinator. + */ + if (PreparePGXCNodes) + { + /* + * Prepare all the nodes involved in this Implicit 2PC + * If Coordinator COMMIT fails, nodes are also rollbacked during AbortTransaction(). + * + * Track if PGXC Nodes are already prepared + */ + if (PGXCNodeImplicitPrepare(xid, implicitgid) < 0) + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot COMMIT a transaction whose PREPARE has failed on Nodes"))); + else + s->ArePGXCNodesPrepared = true; + } +#endif /* * Do pre-commit processing (most of this stuff requires database access, @@ -1756,6 +1813,10 @@ CommitTransaction(void) * deferred triggers, and it's also possible that triggers create holdable * cursors. So we have to loop until there's nothing left to do. 
*/ +#ifdef PGXC + if (!PrepareLocalCoord) + { +#endif for (;;) { /* @@ -1800,8 +1861,11 @@ CommitTransaction(void) /* * There can be error on the data nodes. So go to data nodes before * changing transaction state and local clean up + * Here simply commit on nodes, we know that 2PC is not involved implicitely. + * + * This is called only if it is not necessary to prepare the nodes. */ - if (IS_PGXC_COORDINATOR) + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && !PreparePGXCNodes) PGXCNodeCommit(); #endif @@ -1825,8 +1889,10 @@ CommitTransaction(void) /* * Now we can let GTM know about transaction commit. * Only a Remote Coordinator is allowed to do that. + * + * Also do not commit a transaction that has already been prepared on Datanodes */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && !PreparePGXCNodes) { CommitTranGTM(s->globalTransactionId); latestXid = s->globalTransactionId; @@ -1908,6 +1974,46 @@ CommitTransaction(void) AtEOXact_MultiXact(); +#ifdef PGXC + }/* End of !PrepareLocalCoord */ + + /* + * At this point, if no 2pc has been used, we have a transaction that committed on GTM, + * local coord and nodes, so the remaining stuff is only ressource cleanup. + * If 2pc has been used, Coordinator has been prepared (if 2 Coordinators at least are involved + * in current transaction). + * Datanodes have also been prepared if more than 1 Datanode has been written. + * + * Here we complete Implicit 2PC in the following order + * - Commit the prepared transaction on local coordinator (if necessary) + * - Commit on the remaining nodes + */ + + if (PreparePGXCNodes) + { + /* + * Preparing for Commit, transaction has to take a new TransactionID for Commit + * It is considered as in Progress state. 
+ */ + s->state = TRANS_INPROGRESS; + s->globalCommitTransactionId = BeginTranGTM(NULL); + + /* COMMIT local Coordinator */ + if (PrepareLocalCoord) + { + FinishPreparedTransaction(implicitgid, true); + } + + /* + * Commit all the nodes involved in this implicit 2PC. + * COMMIT on GTM is made here and is made at the same time + * for prepared GXID and commit GXID to limit interactions between GTM and Coord. + * This explains why prepared GXID is also in argument. + */ + PGXCNodeImplicitCommitPrepared(xid, s->globalCommitTransactionId, implicitgid, true); + } +#endif + ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, true, true); @@ -1948,7 +2054,11 @@ CommitTransaction(void) #ifdef PGXC if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { s->globalTransactionId = InvalidGlobalTransactionId; + s->globalCommitTransactionId = InvalidGlobalTransactionId; + s->ArePGXCNodesPrepared = false; + } else if (IS_PGXC_DATANODE || IsConnFromCoord()) SetNextTransactionId(InvalidTransactionId); #endif @@ -1972,9 +2082,11 @@ CommitTransaction(void) /* * Only a Postgres-XC Coordinator that received a PREPARE Command from * an application can use this special prepare. + * If PrepareTransaction is called during an implicit 2PC, do not release ressources, + * this is made by CommitTransaction when transaction has been committed on Nodes. 
*/ static void -PrepareTransaction(bool write_2pc_file) +PrepareTransaction(bool write_2pc_file, bool is_implicit) #else static void PrepareTransaction(void) @@ -2170,6 +2282,14 @@ PrepareTransaction(void) } #endif +#ifdef PGXC + /* + * In case of an implicit 2PC, ressources are released by CommitTransaction() + */ + if (!is_implicit) + { +#endif + ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, true, true); @@ -2219,6 +2339,9 @@ PrepareTransaction(void) s->state = TRANS_DEFAULT; RESUME_INTERRUPTS(); +#ifdef PGXC + } /* is_implicit END */ +#endif } @@ -2286,8 +2409,13 @@ AbortTransaction(void) /* * We should rollback on the data nodes before cleaning up portals * to be sure data structures used by connections are not freed yet + * + * It is also necessary to check that node are not partially committed + * in an implicit 2PC, correct handling is made below. */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + if (IS_PGXC_COORDINATOR && + !IsConnFromCoord() && + !TransactionIdIsValid(s->globalCommitTransactionId)) { /* * Make sure this is rolled back on the DataNodes @@ -2310,6 +2438,10 @@ AbortTransaction(void) * Advertise the fact that we aborted in pg_clog (assuming that we got as * far as assigning an XID to advertise). */ +#ifdef PGXC + /* Do not abort a transaction that has already been committed in an implicit 2PC */ + if (!TransactionIdIsValid(s->globalCommitTransactionId)) +#endif latestXid = RecordTransactionAbort(false); TRACE_POSTGRESQL_TRANSACTION_ABORT(MyProc->lxid); @@ -2317,8 +2449,56 @@ AbortTransaction(void) /* This is done by remote Coordinator */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) { - RollbackTranGTM(s->globalTransactionId); + /* + * Rollback the transaction ID only if it is not being used by an implicit 2PC. 
+ */ + if (!s->ArePGXCNodesPrepared) + RollbackTranGTM(s->globalTransactionId); + latestXid = s->globalTransactionId; + + /* Rollback Prepared Nodes if they are totally prepared but not committed at all */ + if (s->ArePGXCNodesPrepared && !TransactionIdIsValid(s->globalCommitTransactionId)) + { + char implicitgid[256]; + + sprintf(implicitgid, "T%d", s->globalTransactionId); + PGXCNodeImplicitCommitPrepared(s->globalTransactionId, + s->globalCommitTransactionId, + implicitgid, false); + } + else if (s->ArePGXCNodesPrepared && TransactionIdIsValid(s->globalCommitTransactionId)) + { + /* + * In this case transaction is partially committed, pick up the list of nodes + * prepared and not committed and register them on GTM as if it is an explicit 2PC. + * This permits to keep the transaction alive in snapshot and other transaction + * don't have any side effects with partially committed transactions + */ + char implicitgid[256]; + int co_conn_count, dn_conn_count; + PGXC_NodeId *datanodes = NULL; + PGXC_NodeId *coordinators = NULL; + + sprintf(implicitgid, "T%d", s->globalTransactionId); + + /* Get the list of nodes in error state */ + PGXCNodeGetNodeList(&datanodes, &dn_conn_count, &coordinators, &co_conn_count); + + /* Save the node list and gid on GTM. */ + StartPreparedTranGTM(s->globalTransactionId, implicitgid, + dn_conn_count, datanodes, co_conn_count, coordinators); + + /* Finish to prepare the transaction. */ + PrepareTranGTM(s->globalTransactionId); + + /* + * Rollback commit GXID as it has been used by an implicit 2PC. + * It is important at this point not to Commit the GXID used for PREPARE + * to keep it visible in snapshot for other transactions. + */ + RollbackTranGTM(s->globalCommitTransactionId); + } } else if (IS_PGXC_DATANODE || IsConnFromCoord()) { @@ -2602,7 +2782,7 @@ CommitTransactionCommand(void) * return to the idle state. 
*/ case TBLOCK_PREPARE: - PrepareTransaction(true); + PrepareTransaction(true, false); s->blockState = TBLOCK_DEFAULT; break; @@ -2612,7 +2792,7 @@ CommitTransactionCommand(void) * that involved DDLs on a Coordinator. */ case TBLOCK_PREPARE_NO_2PC_FILE: - PrepareTransaction(false); + PrepareTransaction(false, false); s->blockState = TBLOCK_DEFAULT; break; #endif @@ -2647,17 +2827,20 @@ CommitTransactionCommand(void) CommitTransaction(); s->blockState = TBLOCK_DEFAULT; } -#ifdef PGXC - else if (s->blockState == TBLOCK_PREPARE || - s->blockState == TBLOCK_PREPARE_NO_2PC_FILE) -#else else if (s->blockState == TBLOCK_PREPARE) -#endif { Assert(s->parent == NULL); - PrepareTransaction(true); + PrepareTransaction(true, false); s->blockState = TBLOCK_DEFAULT; } +#ifdef PGXC + else if (s->blockState == TBLOCK_PREPARE_NO_2PC_FILE) + { + Assert(s->parent == NULL); + PrepareTransaction(false, false); + s->blockState = TBLOCK_DEFAULT; + } +#endif else { Assert(s->blockState == TBLOCK_INPROGRESS || diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index a524c13..b954003 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -49,6 +49,7 @@ extern char *deparseSql(RemoteQueryState *scanstate); #define PRIMARY_NODE_WRITEAHEAD 1024 * 1024 static bool autocommit = true; +static bool implicit_force_autocommit = false; static PGXCNodeHandle **write_node_list = NULL; static int write_node_count = 0; @@ -61,6 +62,14 @@ static int pgxc_node_rollback_prepared(GlobalTransactionId gxid, GlobalTransacti PGXCNodeAllHandles * pgxc_handles, char *gid); static int pgxc_node_commit_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, PGXCNodeAllHandles * pgxc_handles, char *gid); +static int pgxc_node_implicit_commit_prepared(GlobalTransactionId prepare_xid, + GlobalTransactionId commit_xid, + PGXCNodeAllHandles * pgxc_handles, + char *gid, + bool is_commit); +static int 
pgxc_node_implicit_prepare(GlobalTransactionId prepare_xid, + PGXCNodeAllHandles * pgxc_handles, char *gid); + static PGXCNodeAllHandles * get_exec_connections(ExecNodes *exec_nodes, RemoteQueryExecType exec_type); static int pgxc_node_receive_and_validate(const int conn_count, @@ -74,7 +83,7 @@ static int handle_response_clear(PGXCNodeHandle * conn); static void close_node_cursors(PGXCNodeHandle **connections, int conn_count, char *cursor); -static PGXCNodeAllHandles *pgxc_get_all_transaction_nodes(void); +static PGXCNodeAllHandles *pgxc_get_all_transaction_nodes(PGXCNode_HandleRequested status_requested); #define MAX_STATEMENTS_PER_TRAN 10 @@ -1505,7 +1514,7 @@ PGXCNodePrepare(char *gid) PGXCNodeAllHandles *pgxc_connections; bool local_operation = false; - pgxc_connections = pgxc_get_all_transaction_nodes(); + pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_DEFAULT); /* DDL involved in transaction, so make a local prepare too */ if (pgxc_connections->co_conn_count != 0) @@ -1669,6 +1678,176 @@ finish: return result; } +/* + * Prepare all the nodes involved in this implicit Prepare + * Abort transaction if this is not done correctly + */ +int +PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid) +{ + int res = 0; + int tran_count; + PGXCNodeAllHandles *pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_DEFAULT); + + if (!pgxc_connections) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not prepare connection implicitely"))); + + tran_count = pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; + + /* + * This should not happen because an implicit 2PC is always using other nodes, + * but it is better to check. 
+ */ + if (tran_count == 0) + { + goto finish; + } + + res = pgxc_node_implicit_prepare(prepare_xid, pgxc_connections, gid); + +finish: + if (!autocommit) + stat_transaction(pgxc_connections->dn_conn_count); + + return res; +} + +/* + * Prepare transaction on dedicated nodes for Implicit 2PC + * This is done inside a Transaction commit if multiple nodes are involved in write operations + * Implicit prepare in done internally on Coordinator, so this does not interact with GTM. + */ +static int +pgxc_node_implicit_prepare(GlobalTransactionId prepare_xid, + PGXCNodeAllHandles *pgxc_handles, + char *gid) +{ + int result = 0; + int co_conn_count = pgxc_handles->co_conn_count; + int dn_conn_count = pgxc_handles->dn_conn_count; + char buffer[256]; + + sprintf(buffer, "PREPARE TRANSACTION '%s'", gid); + + /* Continue even after an error here, to consume the messages */ + result = pgxc_all_handles_send_query(pgxc_handles, buffer, true); + + /* Receive and Combine results from Datanodes and Coordinators */ + result |= pgxc_node_receive_and_validate(dn_conn_count, pgxc_handles->datanode_handles, false); + result |= pgxc_node_receive_and_validate(co_conn_count, pgxc_handles->coord_handles, false); + + return result; +} + +/* + * Commit all the nodes involved in this Implicit Commit. + * Prepared XID is committed at the same time as Commit XID on GTM. + */ +void +PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, + GlobalTransactionId commit_xid, + char *gid, + bool is_commit) +{ + int res = 0; + int tran_count; + PGXCNodeAllHandles *pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_IDLE); + + if (!pgxc_connections) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not commit prepared transaction implicitely"))); + + tran_count = pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; + + /* + * This should not happen because an implicit 2PC is always using other nodes, + * but it is better to check. 
+ */ + if (tran_count == 0) + { + elog(WARNING, "Nothing to PREPARE on Datanodes and Coordinators"); + goto finish; + } + + res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, + pgxc_connections, gid, is_commit); + +finish: + /* Clear nodes, signals are clear */ + if (!autocommit) + stat_transaction(pgxc_connections->dn_conn_count); + + /* + * If an error happened, do not release handles yet. This is done when transaction + * is aborted after the list of nodes in error state has been saved to be sent to GTM + */ + if (!PersistentConnections && res == 0) + release_handles(false); + autocommit = true; + clear_write_node_list(); + + if (res != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not commit prepared transaction implicitely"))); + + /* + * Commit on GTM is made once we are sure that Nodes are not only partially committed + * If an error happens on a Datanode during implicit COMMIT PREPARED, a special handling + * is made in AbortTransaction(). + * The list of datanodes is saved on GTM and the partially committed transaction can be committed + * with a COMMIT PREPARED delivered directly from application. + * This permits to keep the gxid alive in snapshot and avoids other transactions to see only + * partially committed results. + */ + CommitPreparedTranGTM(prepare_xid, commit_xid); +} + +/* + * Commit a transaction implicitely transaction on all nodes + * Prepared transaction with this gid has reset the datanodes, + * so we need a new gxid. + * + * GXID used for Prepare and Commit are committed at the same time on GTM. + * This saves Network ressource a bit. 
+ */ +static int +pgxc_node_implicit_commit_prepared(GlobalTransactionId prepare_xid, + GlobalTransactionId commit_xid, + PGXCNodeAllHandles *pgxc_handles, + char *gid, + bool is_commit) +{ + char buffer[256]; + int result = 0; + int co_conn_count = pgxc_handles->co_conn_count; + int dn_conn_count = pgxc_handles->dn_conn_count; + + if (is_commit) + sprintf(buffer, "COMMIT PREPARED '%s'", gid); + else + sprintf(buffer, "ROLLBACK PREPARED '%s'", gid); + + if (pgxc_all_handles_send_gxid(pgxc_handles, commit_xid, true)) + { + result = EOF; + goto finish; + } + + /* Send COMMIT to all handles */ + if (pgxc_all_handles_send_query(pgxc_handles, buffer, false)) + result = EOF; + + /* Receive and Combine results from Datanodes and Coordinators */ + result |= pgxc_node_receive_and_validate(dn_conn_count, pgxc_handles->datanode_handles, false); + result |= pgxc_node_receive_and_validate(co_conn_count, pgxc_handles->coord_handles, false); + +finish: + return result; +} /* * Commit prepared transaction on Datanodes and Coordinators (as necessary) @@ -1684,7 +1863,7 @@ PGXCNodeCommitPrepared(char *gid) { int res = 0; int res_gtm = 0; - PGXCNodeAllHandles *pgxc_handles; + PGXCNodeAllHandles *pgxc_handles = NULL; List *datanodelist = NIL; List *coordlist = NIL; int i, tran_count; @@ -1812,7 +1991,7 @@ PGXCNodeRollbackPrepared(char *gid) { int res = 0; int res_gtm = 0; - PGXCNodeAllHandles *pgxc_handles; + PGXCNodeAllHandles *pgxc_handles = NULL; List *datanodelist = NIL; List *coordlist = NIL; int i, tran_count; @@ -1922,6 +2101,8 @@ pgxc_node_rollback_prepared(GlobalTransactionId gxid, GlobalTransactionId prepar /* * Commit current transaction on data nodes where it has been started + * This function is called when no 2PC is involved implicitely. + * So only send a commit to the involved nodes. 
*/ void PGXCNodeCommit(void) @@ -1930,7 +2111,7 @@ PGXCNodeCommit(void) int tran_count; PGXCNodeAllHandles *pgxc_connections; - pgxc_connections = pgxc_get_all_transaction_nodes(); + pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_DEFAULT); tran_count = pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; @@ -1952,7 +2133,7 @@ finish: autocommit = true; clear_write_node_list(); - /* Clear up connection */ + /* Clean up connections */ pfree_pgxc_all_handles(pgxc_connections); if (res != 0) ereport(ERROR, @@ -1969,71 +2150,11 @@ static int pgxc_node_commit(PGXCNodeAllHandles *pgxc_handles) { char buffer[256]; - GlobalTransactionId gxid = InvalidGlobalTransactionId; int result = 0; int co_conn_count = pgxc_handles->co_conn_count; int dn_conn_count = pgxc_handles->dn_conn_count; - /* can set this to false to disable temporarily */ - /* bool do2PC = conn_count > 1; */ - - /* - * Only use 2PC if more than one node was written to. Otherwise, just send - * COMMIT to all - */ - bool do2PC = write_node_count > 1; - - /* Extra XID for Two Phase Commit */ - GlobalTransactionId two_phase_xid = 0; - - if (do2PC) - { - stat_2pc(); - - /* - * Formally we should be using GetCurrentGlobalTransactionIdIfAny() here, - * but since we need 2pc, we surely have sent down a command and got - * gxid for it. 
Hence GetCurrentGlobalTransactionId() just returns - * already allocated gxid - */ - gxid = GetCurrentGlobalTransactionId(); - - sprintf(buffer, "PREPARE TRANSACTION 'T%d'", gxid); - - if (pgxc_all_handles_send_query(pgxc_handles, buffer, false)) - result = EOF; - - /* Receive and Combine results from Datanodes and Coordinators */ - result |= pgxc_node_receive_and_validate(dn_conn_count, pgxc_handles->datanode_handles, true); - result |= pgxc_node_receive_and_validate(co_conn_count, pgxc_handles->coord_handles, true); - } - - if (!do2PC) - strcpy(buffer, "COMMIT"); - else - { - if (result) - { - sprintf(buffer, "ROLLBACK PREPARED 'T%d'", gxid); - /* Consume any messages on the Datanodes and Coordinators first if necessary */ - PGXCNodeConsumeMessages(); - } - else - sprintf(buffer, "COMMIT PREPARED 'T%d'", gxid); - - /* - * We need to use a new xid, the data nodes have reset - * Timestamp has already been set with BEGIN on remote Datanodes, - * so don't use it here. - */ - two_phase_xid = BeginTranGTM(NULL); - - if (pgxc_all_handles_send_gxid(pgxc_handles, two_phase_xid, true)) - { - result = EOF; - goto finish; - } - } + strcpy(buffer, "COMMIT"); /* Send COMMIT to all handles */ if (pgxc_all_handles_send_query(pgxc_handles, buffer, false)) @@ -2043,10 +2164,6 @@ pgxc_node_commit(PGXCNodeAllHandles *pgxc_handles) result |= pgxc_node_receive_and_validate(dn_conn_count, pgxc_handles->datanode_handles, false); result |= pgxc_node_receive_and_validate(co_conn_count, pgxc_handles->coord_handles, false); -finish: - if (do2PC) - CommitTranGTM((GlobalTransactionId) two_phase_xid); - return result; } @@ -2062,7 +2179,7 @@ PGXCNodeRollback(void) int tran_count; PGXCNodeAllHandles *pgxc_connections; - pgxc_connections = pgxc_get_all_transaction_nodes(); + pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_DEFAULT); tran_count = pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; @@ -2099,7 +2216,6 @@ finish: static int 
pgxc_node_rollback(PGXCNodeAllHandles *pgxc_handles) { - int i; int result = 0; int co_conn_count = pgxc_handles->co_conn_count; int dn_conn_count = pgxc_handles->dn_conn_count; @@ -2881,6 +2997,8 @@ ExecRemoteQuery(RemoteQueryState *node) PGXCNodeAllHandles *pgxc_connections; TupleTableSlot *innerSlot = NULL; + implicit_force_autocommit = force_autocommit; + /* * Inner plan for RemoteQuery supplies parameters. * We execute inner plan to get a tuple and use values of the tuple as @@ -3622,6 +3740,8 @@ ExecRemoteUtility(RemoteQuery *node) bool need_tran; int i; + implicit_force_autocommit = force_autocommit; + remotestate = CreateResponseCombiner(0, node->combine_type); pgxc_connections = get_exec_connections(node->exec_nodes, @@ -3984,7 +4104,7 @@ finish: * for both data nodes and coordinators */ static PGXCNodeAllHandles * -pgxc_get_all_transaction_nodes() +pgxc_get_all_transaction_nodes(PGXCNode_HandleRequested status_requested) { PGXCNodeAllHandles *pgxc_connections; @@ -4009,9 +4129,13 @@ pgxc_get_all_transaction_nodes() /* gather needed connections */ pgxc_connections->dn_conn_count = get_transaction_nodes( - pgxc_connections->datanode_handles, REMOTE_CONN_DATANODE); + pgxc_connections->datanode_handles, + REMOTE_CONN_DATANODE, + status_requested); pgxc_connections->co_conn_count = get_transaction_nodes( - pgxc_connections->coord_handles, REMOTE_CONN_COORD); + pgxc_connections->coord_handles, + REMOTE_CONN_COORD, + status_requested); return pgxc_connections; } @@ -4032,3 +4156,68 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) pfree(pgxc_handles); } + +/* + * Check if an Implicit 2PC is necessary for this transaction. + * Check also if it is necessary to prepare transaction locally. 
+ */ +bool +PGXCNodeIsImplicit2PC(bool *prepare_local_coord) +{ + PGXCNodeAllHandles *pgxc_handles = pgxc_get_all_transaction_nodes(HANDLE_DEFAULT); + int co_conn_count = pgxc_handles->co_conn_count; + + /* Prepare Local Coord only if DDL is involved on multiple nodes */ + *prepare_local_coord = co_conn_count > 0; + + /* + * In case of an autocommit or forced autocommit transaction, 2PC is not involved + * This case happens for Utilities using force autocommit (CREATE DATABASE, VACUUM...) + */ + if (implicit_force_autocommit) + { + implicit_force_autocommit = false; + return false; + } + + /* + * 2PC is necessary at other Nodes if one Datanode or one Coordinator + * other than the local one has been involved in a write operation. + */ + return (write_node_count > 1 || co_conn_count > 0); +} + +/* + * Return the list of active nodes + */ +void +PGXCNodeGetNodeList(PGXC_NodeId **datanodes, + int *dn_conn_count, + PGXC_NodeId **coordinators, + int *co_conn_count) +{ + PGXCNodeAllHandles *pgxc_connections = pgxc_get_all_transaction_nodes(HANDLE_ERROR); + + *dn_conn_count = pgxc_connections->dn_conn_count; + + /* Add in the list local coordinator also if necessary */ + if (pgxc_connections->co_conn_count == 0) + *co_conn_count = pgxc_connections->co_conn_count; + else + *co_conn_count = pgxc_connections->co_conn_count + 1; + + if (pgxc_connections->dn_conn_count != 0) + *datanodes = collect_pgxcnode_numbers(pgxc_connections->dn_conn_count, + pgxc_connections->datanode_handles, REMOTE_CONN_DATANODE); + + if (pgxc_connections->co_conn_count != 0) + *coordinators = collect_pgxcnode_numbers(pgxc_connections->co_conn_count, + pgxc_connections->coord_handles, REMOTE_CONN_COORD); + + /* + * Now release handles properly, the list of handles in error state has been saved + * and will be sent to GTM. 
+ */ + if (!PersistentConnections) + release_handles(false); +} diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index cbaf68c..4790f95 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1579,9 +1579,19 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query) * to a PGXCNodeHandle structure. * The function returns number of pointers written to the connections array. * Remaining items in the array, if any, will be kept unchanged + * + * In an implicit 2PC, status of connections is set back to idle after preparing + * the transaction on each backend. + * At commit phase, it is necessary to get backends in idle state to be able to + * commit properly the backends. + * + * In the case of an error occuring with an implicit 2PC that has been partially + * committed on nodes, return the list of connections that has an error state + * to register the list of remaining nodes not commit prepared on GTM. */ int -get_transaction_nodes(PGXCNodeHandle **connections, char client_conn_type) +get_transaction_nodes(PGXCNodeHandle **connections, char client_conn_type, + PGXCNode_HandleRequested status_requested) { int tran_count = 0; int i; @@ -1596,16 +1606,42 @@ get_transaction_nodes(PGXCNodeHandle **connections, char client_conn_type) * DN_CONNECTION_STATE_ERROR_FATAL. * ERROR_NOT_READY can happen if the data node abruptly disconnects. 
*/ - if (dn_handles[i].sock != NO_SOCKET && dn_handles[i].transaction_status != 'I') - connections[tran_count++] = &dn_handles[i]; + if (status_requested == HANDLE_IDLE) + { + if (dn_handles[i].sock != NO_SOCKET && dn_handles[i].transaction_status == 'I') + connections[tran_count++] = &dn_handles[i]; + } + else if (status_requested == HANDLE_ERROR) + { + if (dn_handles[i].transaction_status == 'E') + connections[tran_count++] = &dn_handles[i]; + } + else + { + if (dn_handles[i].sock != NO_SOCKET && dn_handles[i].transaction_status != 'I') + connections[tran_count++] = &dn_handles[i]; + } } } if (coord_count && client_conn_type == REMOTE_CONN_COORD) { for (i = 0; i < NumCoords; i++) { - if (co_handles[i].sock != NO_SOCKET && co_handles[i].transaction_status != 'I') - connections[tran_count++] = &co_handles[i]; + if (status_requested == HANDLE_IDLE) + { + if (co_handles[i].sock != NO_SOCKET && co_handles[i].transaction_status == 'I') + connections[tran_count++] = &co_handles[i]; + } + else if (status_requested == HANDLE_ERROR) + { + if (co_handles[i].transaction_status == 'E') + connections[tran_count++] = &co_handles[i]; + } + else + { + if (co_handles[i].sock != NO_SOCKET && co_handles[i].transaction_status != 'I') + connections[tran_count++] = &co_handles[i]; + } } } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 6000fdb..69c25d1 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -280,6 +280,14 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * taking a snapshot. See discussion in * src/backend/access/transam/README. 
*/ +#ifdef PGXC + /* + * Remove this assertion check for PGXC on Coordinator + * We could abort even after a Coordinator has committed + * for a 2PC transaction if Datanodes have failed committed the transaction + */ + if (IS_PGXC_DATANODE) +#endif Assert(TransactionIdIsValid(proc->xid)); LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 4a33842..a3c1868 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -43,6 +43,7 @@ typedef enum REQUEST_TYPE_COPY_OUT /* Copy Out response */ } RequestType; + /* * Represents a DataRow message received from a remote node. * Contains originating node number and message body in DataRow format without @@ -111,6 +112,19 @@ extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); extern bool PGXCNodeRollbackPrepared(char *gid); extern bool PGXCNodeCommitPrepared(char *gid); +extern bool PGXCNodeIsImplicit2PC(bool *prepare_local_coord); +extern int PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid); +extern void PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, + GlobalTransactionId commit_xid, + char *gid, + bool is_commit); +extern void PGXCNodeConsumeMessages(void); + +/* Get list of nodes */ +extern void PGXCNodeGetNodeList(PGXC_NodeId **datanodes, + int *dn_conn_count, + PGXC_NodeId **coordinators, + int *co_conn_count); /* Copy command just involves Datanodes */ extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_from); diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index a57e4f1..47b0b96 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -39,6 +39,13 @@ typedef enum DN_CONNECTION_STATE_COPY_OUT } DNConnectionState; +typedef enum +{ + HANDLE_IDLE, + HANDLE_ERROR, + HANDLE_DEFAULT +} PGXCNode_HandleRequested; + #define DN_CONNECTION_STATE_ERROR(dnconn) \ ((dnconn)->state == 
DN_CONNECTION_STATE_ERROR_FATAL \ || (dnconn)->transaction_status == 'E') @@ -97,7 +104,9 @@ extern void PGXCNodeCleanAndRelease(int code, Datum arg); extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only); extern void release_handles(bool force_drop); -extern int get_transaction_nodes(PGXCNodeHandle ** connections, char client_conn_type); +extern int get_transaction_nodes(PGXCNodeHandle ** connections, + char client_conn_type, + PGXCNode_HandleRequested type_requested); extern PGXC_NodeId* collect_pgxcnode_numbers(int conn_count, PGXCNodeHandle ** connections, char client_conn_type); extern int get_active_nodes(PGXCNodeHandle ** connections); ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/xact.c | 213 ++++++++++++++++++++-- src/backend/pgxc/pool/execRemote.c | 341 +++++++++++++++++++++++++++-------- src/backend/pgxc/pool/pgxcnode.c | 46 +++++- src/backend/storage/ipc/procarray.c | 8 + src/include/pgxc/execRemote.h | 14 ++ src/include/pgxc/pgxcnode.h | 11 +- 6 files changed, 536 insertions(+), 97 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-21 21:06:15
|
Project "Postgres-XC". The branch, master has been updated via bb22b7d667c20228e23526c5627197c10ae54672 (commit) from ce57f6ba483c2cb76a96fe32f3850e1eac4dfdd6 (commit) - Log ----------------------------------------------------------------- commit bb22b7d667c20228e23526c5627197c10ae54672 Author: Mason Sharp <ma...@us...> Date: Tue Dec 21 16:05:07 2010 -0500 Fixed recently introduced bug with node determination for write operations. Extracted from Andrei Martsinchyk's patch diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 436a1dd..62cb748 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2320,7 +2320,7 @@ CopyFrom(CopyState cstate) if (DataNodeCopyIn(cstate->line_buf.data, cstate->line_buf.len, GetRelationNodes(cstate->rel_loc, (long *)hash_value, - RELATION_ACCESS_WRITE), + RELATION_ACCESS_INSERT), cstate->connections)) ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), @@ -4023,7 +4023,7 @@ DoInsertSelectCopy(EState *estate, TupleTableSlot *slot) /* Send item to the appropriate data node(s) (buffer) */ if (DataNodeCopyIn(cstate->fe_msgbuf->data, cstate->fe_msgbuf->len, - GetRelationNodes(cstate->rel_loc, (long *)hash_value, RELATION_ACCESS_WRITE), + GetRelationNodes(cstate->rel_loc, (long *)hash_value, RELATION_ACCESS_INSERT), cstate->connections)) ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 4191455..790b81d 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -299,7 +299,8 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, { case LOCATOR_TYPE_REPLICATED: - if (accessType == RELATION_ACCESS_WRITE) + if (accessType == RELATION_ACCESS_UPDATE || + accessType == RELATION_ACCESS_INSERT) { /* we need to write to all synchronously */ exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); @@ -360,7 +361,7 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, 
long *partValue, else { /* If no info, go to node 1 */ - if (accessType == RELATION_ACCESS_WRITE) + if (accessType == RELATION_ACCESS_INSERT) exec_nodes->nodelist = lappend_int(NULL, 1); else /* @@ -380,7 +381,7 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, case LOCATOR_TYPE_RROBIN: /* round robin, get next one */ - if (accessType == RELATION_ACCESS_WRITE) + if (accessType == RELATION_ACCESS_INSERT) { /* write to just one of them */ exec_nodes->nodelist = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid)); diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index fa61826..8d900f1 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -550,7 +550,7 @@ get_plan_nodes_insert(Query *query, RemoteQuery *step) if (!lc) { /* Skip rest, handle NULL */ - step->exec_nodes = GetRelationNodes(rel_loc_info, NULL, RELATION_ACCESS_WRITE); + step->exec_nodes = GetRelationNodes(rel_loc_info, NULL, RELATION_ACCESS_INSERT); return; } @@ -629,7 +629,7 @@ get_plan_nodes_insert(Query *query, RemoteQuery *step) /* single call handles both replicated and partitioned types */ step->exec_nodes = GetRelationNodes(rel_loc_info, part_value_ptr, - RELATION_ACCESS_WRITE); + RELATION_ACCESS_INSERT); if (eval_expr) pfree(eval_expr); @@ -1771,7 +1771,7 @@ get_plan_nodes_command(Query *query, RemoteQuery *step) case CMD_UPDATE: case CMD_DELETE: /* treat as a select */ - get_plan_nodes(query, step, RELATION_ACCESS_WRITE); + get_plan_nodes(query, step, RELATION_ACCESS_UPDATE); break; default: diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index b01606f..ee28c5a 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -31,6 +31,27 @@ typedef int PartAttrNumber; +/* track if tables use pg_catalog */ +typedef enum +{ + TABLE_USAGE_TYPE_NO_TABLE, + TABLE_USAGE_TYPE_PGCATALOG, + TABLE_USAGE_TYPE_USER, + TABLE_USAGE_TYPE_USER_REPLICATED, /* based on a replicated table */ + 
TABLE_USAGE_TYPE_MIXED +} TableUsageType; + +/* + * How relation is accessed in the query + */ +typedef enum +{ + RELATION_ACCESS_READ, /* SELECT */ + RELATION_ACCESS_READ_FOR_UPDATE, /* SELECT FOR UPDATE */ + RELATION_ACCESS_UPDATE, /* UPDATE OR DELETE */ + RELATION_ACCESS_INSERT /* INSERT */ +} RelationAccessType; + typedef struct { Oid relid; @@ -42,17 +63,6 @@ typedef struct ListCell *roundRobinNode; /* points to next one to use */ } RelationLocInfo; - -/* track if tables use pg_catalog */ -typedef enum -{ - TABLE_USAGE_TYPE_NO_TABLE, - TABLE_USAGE_TYPE_PGCATALOG, - TABLE_USAGE_TYPE_USER, - TABLE_USAGE_TYPE_USER_REPLICATED, /* based on a replicated table */ - TABLE_USAGE_TYPE_MIXED -} TableUsageType; - /* * Nodes to execute on * primarynodelist is for replicated table writes, where to execute first. @@ -68,15 +78,6 @@ typedef struct TableUsageType tableusagetype; /* track pg_catalog usage */ } ExecNodes; -/* - * How relation is accessed in the query - */ -typedef enum -{ - RELATION_ACCESS_READ, - RELATION_ACCESS_READ_FOR_UPDATE, - RELATION_ACCESS_WRITE -} RelationAccessType; extern char *PreferredDataNodes; ----------------------------------------------------------------------- Summary of changes: src/backend/commands/copy.c | 4 +- src/backend/pgxc/locator/locator.c | 7 +++-- src/backend/pgxc/plan/planner.c | 6 ++-- src/include/pgxc/locator.h | 41 ++++++++++++++++++----------------- 4 files changed, 30 insertions(+), 28 deletions(-) hooks/post-receive -- Postgres-XC |
From: Koichi S. <koi...@us...> - 2010-12-16 09:25:40
|
Project "website". The branch, master has been updated via ed16e38e3decb21b3cdc0a51b9a26ccb1d4c9cc6 (commit) from 950cd623ce6a04e4a7b1b10b9fca80dfabb10805 (commit) - Log ----------------------------------------------------------------- commit ed16e38e3decb21b3cdc0a51b9a26ccb1d4c9cc6 Author: Koichi Suzuki <ko...@in...> Date: Thu Dec 16 18:28:32 2010 +0900 modified: roadmap.html diff --git a/roadmap.html b/roadmap.html index 50aa661..d2fbece 100755 --- a/roadmap.html +++ b/roadmap.html @@ -84,26 +84,16 @@ subqueries<br> </p> --> <!-- ==== For Version 1.0 ==== --> -<h4> -Version 1.0 (Late in December, 2010) +<h4> +<!-- Version 1.0 (Late in December, 2010) --> +Version 1.0 (March, 2011) </h4> <p class="inner"> -Physical backup/restore incl. PITR<br /> Cross-node oepration optimization<br /> -More variety of statements such as <code>SELECT</code> in <code>INSERT</code><br /> -Full support Prepared statements and cluster-wide recovery<br /> +More variety of statements.<br /> HA Capability<br /> -General aggregate functions<br /> -Savepoint<br /> -Session Parameters<br /> -Forward cursor with <code>ORDER BY</code><br /> -Backward cursor<br /> -Batch, statement pushdown<br /> -Global constraints<br /> -Tuple relocation (distrubute key update)<br /> -Performance improvement <br /> -Regression tests +Trigger<br /> </p> <!-- === Beyond Version 1.0 === ---> @@ -112,9 +102,17 @@ Beyond Version 1.0 </h4> <p class="inner"> -HA Capability<br /> +PITR cluster-wide recovery<br /> +Multi-step Prepared statments<br /> +More variety of statements, such as <code>SELECT</code> in <code>INSERT</code><br /> GTM-Standby<br /> -Trigger<br /> +Savepoint<br /> +Session Parameters<br /> +Backward cursor<br /> +Batch, statement pushdown<br /> +Global constraints<br /> +Tuple relocation (distrubute key update)<br /> +Regression tests<br /> </p> </body> ----------------------------------------------------------------------- Summary of changes: roadmap.html | 32 +++++++++++++++----------------- 1 
files changed, 15 insertions(+), 17 deletions(-) hooks/post-receive -- website |
From: Michael P. <mic...@us...> - 2010-12-14 01:12:04
|
Project "Postgres-XC". The branch, master has been updated via ce57f6ba483c2cb76a96fe32f3850e1eac4dfdd6 (commit) from 75fbef774e81432cdd5ff4eeabf203b12be560a9 (commit) - Log ----------------------------------------------------------------- commit ce57f6ba483c2cb76a96fe32f3850e1eac4dfdd6 Author: Michael P <mic...@us...> Date: Tue Dec 14 10:09:29 2010 +0900 Change Protocol connection between PGXC nodes and GTM/GTM-Proxy. This patch uses several node types to identify what is connecting on GTM PGXC_NODE_GTM_PROXY PGXC_NODE_GTM_PROXY_POSTMASTER PGXC_NODE_COORDINATOR PGXC_NODE_DATANODE If a Postmaster gets a connection on GTM, it also identifies itself like this. This is a prerequisite for Node and Proxy registration on GTM. diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index fafa0b7..a464578 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -16,6 +16,8 @@ #include "access/gtm.h" #include "access/transam.h" #include "utils/elog.h" +#include "miscadmin.h" +#include "pgxc/pgxc.h" /* Configuration variables */ char *GtmHost = "localhost"; @@ -29,7 +31,6 @@ static GTM_Conn *conn; #define CheckConnection() \ if (GTMPQstatus(conn) != CONNECTION_OK) InitGTM() - bool IsGTMConnected() { @@ -42,7 +43,21 @@ InitGTM() /* 256 bytes should be enough */ char conn_str[256]; - sprintf(conn_str, "host=%s port=%d coordinator_id=%d", GtmHost, GtmPort, PGXCNodeId); + /* If this thread is postmaster itself, it contacts gtm identifying itself */ + if (!IsUnderPostmaster) + { + GTM_PGXCNodeType remote_type = PGXC_NODE_DEFAULT; + + if (IS_PGXC_COORDINATOR) + remote_type = PGXC_NODE_COORDINATOR; + else if (IS_PGXC_DATANODE) + remote_type = PGXC_NODE_DATANODE; + + sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d remote_type=%d postmaster=1", + GtmHost, GtmPort, PGXCNodeId, remote_type); + } + else + sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d", GtmHost, GtmPort, PGXCNodeId); conn = PQconnectGTM(conn_str); if 
(GTMPQstatus(conn) != CONNECTION_OK) @@ -51,9 +66,9 @@ InitGTM() ereport(WARNING, (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("can not connect to GTM: %m"))); + errmsg("can not connect to GTM: %m"))); - errno = save_errno; + errno = save_errno; CloseGTM(); } diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 29d8fe4..5a2ad1d 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -54,8 +54,9 @@ static const GTMPQconninfoOption GTMPQconninfoOptions[] = { {"host", NULL}, {"hostaddr", NULL}, {"port", NULL}, - {"coordinator_id", NULL}, - {"proxy", NULL}, + {"pgxc_node_id", NULL}, + {"remote_type", NULL}, + {"postmaster", NULL}, /* Terminating entry --- MUST BE LAST */ {NULL, NULL} }; @@ -168,10 +169,12 @@ connectOptions1(GTM_Conn *conn, const char *conninfo) conn->pgport = tmp ? strdup(tmp) : NULL; tmp = conninfo_getval(connOptions, "connect_timeout"); conn->connect_timeout = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "coordinator_id"); - conn->coordinator_id = tmp ? strdup(tmp) : NULL; - tmp = conninfo_getval(connOptions, "proxy"); - conn->is_proxy = tmp ? atoi(tmp) : 0; + tmp = conninfo_getval(connOptions, "pgxc_node_id"); + conn->pgxc_node_id = tmp ? strdup(tmp) : NULL; + tmp = conninfo_getval(connOptions, "postmaster"); + conn->is_postmaster = tmp ? atoi(tmp) : 0; + tmp = conninfo_getval(connOptions, "remote_type"); + conn->remote_type = tmp ? atoi(tmp) : PGXC_NODE_DEFAULT; /* * Free the option info - all is in conn now @@ -661,14 +664,15 @@ keep_going: /* We will come back to here until there is /* * Build a startup packet. We tell the GTM server/proxy our - * coordinator ID and whether we are a proxy or not. + * PGXC Node ID and whether we are a proxy or not. * * When the connection is made from the proxy, we let the GTM * server know about it so that some special headers are * handled correctly by the server. 
*/ - sp.sp_cid = atoi(conn->coordinator_id); - sp.sp_isproxy = conn->is_proxy; + sp.sp_cid = atoi(conn->pgxc_node_id); + sp.sp_remotetype = conn->remote_type; + sp.sp_ispostmaster = conn->is_postmaster; /* * Send the startup packet. diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index cb735c2..117f89f 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -57,7 +57,7 @@ pqParseInput(GTM_Conn *conn) memset(conn->result, 0, sizeof (GTM_Result)); } else - gtmpqFreeResultData(conn->result, conn->is_proxy); + gtmpqFreeResultData(conn->result, conn->remote_type); result = conn->result; @@ -186,7 +186,7 @@ gtmpqGetError(GTM_Conn *conn, GTM_Result *result) * If we are a GTM proxy, expect an additional proxy header in the incoming * message. */ - if (conn->is_proxy) + if (conn->remote_type == PGXC_NODE_GTM_PROXY) { if (gtmpqGetnchar((char *)&result->gr_proxyhdr, sizeof (GTM_ProxyMsgHeader), conn)) @@ -298,7 +298,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) return 1; result->gr_msglen -= 4; - if (conn->is_proxy) + if (conn->remote_type == PGXC_NODE_GTM_PROXY) { if (gtmpqGetnchar((char *)&result->gr_proxyhdr, sizeof (GTM_ProxyMsgHeader), conn)) @@ -623,14 +623,14 @@ gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn) } void -gtmpqFreeResultData(GTM_Result *result, bool is_proxy) +gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) { /* * If we are running as a GTM proxy, we don't have anything to do. 
This may * change though as we add more message types below and some of them may * need cleanup even at the proxy level */ - if (is_proxy) + if (remote_type == PGXC_NODE_GTM_PROXY) return; switch (result->gr_type) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 6ff3996..08911d8 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -27,7 +27,7 @@ #include "gtm/gtm_msg.h" #include "gtm/assert.h" -void GTM_FreeResult(GTM_Result *result, bool is_proxy); +void GTM_FreeResult(GTM_Result *result, GTM_PGXCNodeType remote_type); /* * Connection Management API @@ -279,7 +279,7 @@ start_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, gtmpqPutc(true, conn) || gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) || /* Send also GID for an explicit prepared transaction */ - gtmpqPutInt(strlen(gid), sizeof (GTM_GIDLen), conn) || + gtmpqPutInt(strlen(gid), sizeof (GTM_StrLen), conn) || gtmpqPutnchar((char *) gid, strlen(gid), conn) || gtmpqPutInt(datanodecnt, sizeof (int), conn) || gtmpqPutInt(coordcnt, sizeof (int), conn)) @@ -386,7 +386,7 @@ get_gid_data(GTM_Conn *conn, gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) || gtmpqPutc(txn_read_only, conn) || /* Send also GID for an explicit prepared transaction */ - gtmpqPutInt(strlen(gid), sizeof (GTM_GIDLen), conn) || + gtmpqPutInt(strlen(gid), sizeof (GTM_StrLen), conn) || gtmpqPutnchar((char *) gid, strlen(gid), conn)) goto send_failed; @@ -792,10 +792,10 @@ send_failed: } void -GTM_FreeResult(GTM_Result *result, bool is_proxy) +GTM_FreeResult(GTM_Result *result, GTM_PGXCNodeType remote_type) { if (result == NULL) return; - gtmpqFreeResultData(result, is_proxy); + gtmpqFreeResultData(result, remote_type); free(result); } diff --git a/src/gtm/client/test/test_seq.c b/src/gtm/client/test/test_seq.c index da0ed91..ba1981e 100644 --- a/src/gtm/client/test/test_seq.c +++ b/src/gtm/client/test/test_seq.c @@ -15,8 +15,12 @@ main(int argc, 
char *argv[]) { int ii; pid_t parent_pid; + GTM_Conn *conn = NULL; + char connect_string[100]; - GTM_Conn *conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + sprintf(connect_string, "host=%s port=%d pgxc_node_id=1 remote_type=%d", PGXC_NODE_COORDINATOR); + + conn = PQconnectGTM(connect_string); if (conn == NULL) { client_log(("Error in connection")); @@ -36,7 +40,7 @@ main(int argc, char *argv[]) seqkey.gsk_keylen = strlen(buf); seqkey.gsk_key = buf; if (open_sequence(conn, &seqkey, 10, 1, 10000, 100, false)) - client_log(("Open seq failed\n")); + client_log(("Open seq failed\n")); else client_log(("Opened Sequence %s\n", seqkey.gsk_key)); } @@ -55,7 +59,7 @@ main(int argc, char *argv[]) /* * Each process now opens a new connection with the GTM */ - conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + conn = PQconnectGTM(connect_string); /* * Try to read/increment the sequence diff --git a/src/gtm/client/test/test_snap.c b/src/gtm/client/test/test_snap.c index a2ce2f9..718ad3c 100644 --- a/src/gtm/client/test/test_snap.c +++ b/src/gtm/client/test/test_snap.c @@ -16,11 +16,14 @@ main(int argc, char *argv[]) int ii; GlobalTransactionId gxid[4000]; GTM_Conn *conn; + char connect_string[100]; for (ii = 0; ii < 3; ii++) fork(); - conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + sprintf(connect_string, "host=localhost port=6666 pgxc_node_id=1 remote_type=%d", PGXC_NODE_COORDINATOR); + + conn = PQconnectGTM(connect_string); if (conn == NULL) { client_log(("Error in connection\n")); diff --git a/src/gtm/client/test/test_snapperf.c b/src/gtm/client/test/test_snapperf.c index bc0e511..3218c2a 100644 --- a/src/gtm/client/test/test_snapperf.c +++ b/src/gtm/client/test/test_snapperf.c @@ -22,8 +22,11 @@ main(int argc, char *argv[]) GlobalTransactionId gxid[TXN_COUNT]; GTM_Conn *conn; + char connect_string[100]; - conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + sprintf(connect_string, "host=localhost 
port=6666 pgxc_node_id=1 remote_type=%d", PGXC_NODE_COORDINATOR); + + conn = PQconnectGTM(connect_string); if (conn == NULL) { client_log(("Error in connection\n")); diff --git a/src/gtm/client/test/test_txn.c b/src/gtm/client/test/test_txn.c index 01ed3de..2c805de 100644 --- a/src/gtm/client/test/test_txn.c +++ b/src/gtm/client/test/test_txn.c @@ -17,11 +17,15 @@ main(int argc, char *argv[]) int ii; GlobalTransactionId gxid[4000]; GTM_Conn *conn; + char connect_string[100]; + GTM_Timestamp *timestamp; for (ii = 0; ii < 3; ii++) fork(); - conn = PQconnectGTM("host=localhost port=6666 coordinator_id=1"); + sprintf(connect_string, "host=localhost port=6666 pgxc_node_id=1 remote_type=%d", PGXC_NODE_COORDINATOR); + + conn = PQconnectGTM(connect_string); if (conn == NULL) { client_log(("Error in connection\n")); @@ -30,7 +34,7 @@ main(int argc, char *argv[]) for (ii = 0; ii < 20; ii++) { - gxid[ii] = begin_transaction(conn, GTM_ISOLATION_SERIALIZABLE); + gxid[ii] = begin_transaction(conn, GTM_ISOLATION_SERIALIZABLE, timestamp); if (gxid[ii] != InvalidGlobalTransactionId) client_log(("Started a new transaction (GXID:%u)\n", gxid[ii])); else @@ -43,7 +47,7 @@ main(int argc, char *argv[]) nodes[0] = 1; nodes[1] = 1; - if (!prepare_transaction(conn, gxid[ii], 2, nodes)) + if (!prepare_transaction(conn, gxid[ii])) client_log(("PREPARE successful (GXID:%u)\n", gxid[ii])); else client_log(("PREPARE failed (GXID:%u)\n", gxid[ii])); diff --git a/src/gtm/client/test/test_txnperf.c b/src/gtm/client/test/test_txnperf.c index 174f0a8..04b218e 100644 --- a/src/gtm/client/test/test_txnperf.c +++ b/src/gtm/client/test/test_txnperf.c @@ -56,7 +56,7 @@ main(int argc, char *argv[]) int kk; char connect_string[100]; int gtmport; - int coordinator_id; + PGXCNodeId pgxc_node_id; int nclients; int ntxns_per_cli; int nstmts_per_txn; @@ -119,10 +119,10 @@ main(int argc, char *argv[]) break; case 'i': - coordinator_id = atoi(optarg); - sprintf(test_output, "TEST_OUTPUT_%d\0", coordinator_id); - 
sprintf(test_end, "TEST_END_%d\0", coordinator_id); - sprintf(test_output_csv, "TEST_OUTPUT_%d.CSV\0", coordinator_id); + pgxc_node_id = atoi(optarg); + sprintf(test_output, "TEST_OUTPUT_%d\0", pgxc_node_id); + sprintf(test_end, "TEST_END_%d\0", pgxc_node_id); + sprintf(test_output_csv, "TEST_OUTPUT_%d.CSV\0", pgxc_node_id); break; default: @@ -132,7 +132,7 @@ main(int argc, char *argv[]) } } - sprintf(connect_string, "host=%s port=%d coordinator_id=%d", gtmhost, gtmport, coordinator_id); + sprintf(connect_string, "host=%s port=%d pgxc_node_id=%d remote_type=%d", gtmhost, gtmport, pgxc_node_id, PGXC_NODE_COORDINATOR); sprintf(system_cmd, "echo -------------------------------------------------------- >> %s", test_output); system(system_cmd); diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 626dc36..65f28a2 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -859,7 +859,7 @@ send_message_to_frontend(Port *myport, ErrorData *edata) /* 'N' (Notice) is for nonfatal conditions, 'E' is for errors */ pq_beginmessage(&msgbuf, (edata->elevel < ERROR) ? 
'N' : 'E'); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 6d946a9..390d0f6 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -131,7 +131,7 @@ seq_release_seqinfo(GTM_SeqInfo *seqinfo) GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); Assert(seqinfo->gs_ref_count > 0); seqinfo->gs_ref_count--; - + if ((seqinfo->gs_state == SEQ_STATE_DELETED) && (seqinfo->gs_ref_count == 0)) remove = true; @@ -880,7 +880,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message) */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_INIT_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -890,7 +890,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -948,7 +948,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_ALTER_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -958,7 +958,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -984,7 +984,7 @@ ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_GET_CURRENT_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -995,7 +995,7 @@ 
ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -1020,7 +1020,7 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1031,7 +1031,7 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -1078,7 +1078,7 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_SET_VAL_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1088,7 +1088,7 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -1112,7 +1112,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_RESET_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1122,7 +1122,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -1148,7 +1148,7 @@ 
ProcessSequenceCloseCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_CLOSE_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1158,7 +1158,7 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } @@ -1200,7 +1200,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message) /* Send a SUCCESS message back to the client */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SEQUENCE_RENAME_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1210,7 +1210,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, newseqkey.gsk_key, newseqkey.gsk_keylen); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); } diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c index 5c9b4b2..5c0125e 100644 --- a/src/gtm/main/gtm_snap.c +++ b/src/gtm/main/gtm_snap.c @@ -345,7 +345,7 @@ ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, get_gxid ? 
SNAPSHOT_GXID_GET_RESULT : SNAPSHOT_GET_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -362,7 +362,7 @@ ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid) sizeof(GlobalTransactionId) * snapshot->sn_xcnt); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; @@ -426,7 +426,7 @@ ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, SNAPSHOT_GET_MULTI_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -442,7 +442,7 @@ ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message) sizeof(GlobalTransactionId) * snapshot->sn_xcnt); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index f388197..252fc43 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -632,7 +632,7 @@ SetNextGlobalTransactionId(GlobalTransactionId gxid) /* Transaction Control */ int -GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, +GTM_BeginTransactionMulti(GTM_PGXCNodeId coord_id, GTM_IsolationLevel isolevel[], bool readonly[], GTMProxy_ConnID connid[], @@ -729,7 +729,7 @@ GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, /* Transaction Control */ GTM_TransactionHandle -GTM_BeginTransaction(GTM_CoordinatorId coord_id, +GTM_BeginTransaction(GTM_PGXCNodeId coord_id, GTM_IsolationLevel isolevel, bool readonly) { @@ -1029,7 +1029,7 @@ ProcessBeginTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; 
proxyhdr.ph_conid = myport->conn_id; @@ -1039,7 +1039,7 @@ ProcessBeginTransactionCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&timestamp, sizeof (GTM_Timestamp)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1089,7 +1089,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_GETGXID_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1099,7 +1099,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&timestamp, sizeof (GTM_Timestamp)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1150,7 +1150,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1159,7 +1159,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1227,7 +1227,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_BEGIN_GETGXID_MULTI_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1238,7 +1238,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&(timestamp), sizeof
(GTM_Timestamp)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1291,7 +1291,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_COMMIT_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1301,7 +1301,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message) pq_sendint(&buf, status, sizeof(status)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1362,7 +1362,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_COMMIT_PREPARED_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1372,7 +1372,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message) pq_sendint(&buf, status[0], 4); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1408,7 +1408,7 @@ ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) txn_read_only = pq_getmsgbyte(message); /* receive GID */ - gidlen = pq_getmsgint(message, sizeof (GTM_GIDLen)); + gidlen = pq_getmsgint(message, sizeof (GTM_StrLen)); gid = (char *)pq_getmsgbytes(message, gidlen); pq_getmsgend(message); @@ -1451,7 +1451,7 @@ ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) */ pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_GET_GID_DATA_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1473,7 +1473,7 @@ ProcessGetGIDDataTransactionCommand(Port *myport, 
StringInfo message) pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1526,7 +1526,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_ROLLBACK_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1536,7 +1536,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message) pq_sendint(&buf, status, sizeof(status)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1598,7 +1598,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_COMMIT_MULTI_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1608,7 +1608,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1669,7 +1669,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_ROLLBACK_MULTI_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1679,7 +1679,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)status, sizeof(int) * txn_count); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1695,7 +1695,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo 
message) GlobalTransactionId gxid; int isgxid = 0; int datanodecnt,coordcnt; - GTM_GIDLen gidlen; + GTM_StrLen gidlen; PGXC_NodeId *coordinators = NULL; PGXC_NodeId *datanodes = NULL; MemoryContext oldContext; @@ -1724,7 +1724,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message) } /* get GID */ - gidlen = pq_getmsgint(message, sizeof (GTM_GIDLen)); + gidlen = pq_getmsgint(message, sizeof (GTM_StrLen)); gid = (char *)pq_getmsgbytes(message, gidlen); /* Get Datanode Count Data */ @@ -1768,7 +1768,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_START_PREPARED_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1777,7 +1777,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&gxid, sizeof(GlobalTransactionId)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1830,7 +1830,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_PREPARE_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1839,7 +1839,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } @@ -1885,7 +1885,7 @@ ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message) pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_GET_GXID_RESULT, 4); - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) { GTM_ProxyMsgHeader proxyhdr; proxyhdr.ph_conid = myport->conn_id; @@ -1895,7 
+1895,7 @@ ProcessGetGXIDTransactionCommand(Port *myport, StringInfo message) pq_sendbytes(&buf, (char *)&gxid, sizeof(gxid)); pq_endmessage(myport, &buf); - if (!myport->is_proxy) + if (myport->remote_type != PGXC_NODE_GTM_PROXY) pq_flush(myport); return; } diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 1cba1ea..118faab 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -75,8 +75,8 @@ static void ProcessSnapshotCommand(Port *myport, GTM_MessageType mtype, StringIn static void ProcessSequenceCommand(Port *myport, GTM_MessageType mtype, StringInfo message); static void ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message); -static void GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id); -static void GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId coordinator_id); +static void GTM_RegisterPGXCNode(Port *myport, GTM_PGXCNodeId pgxc_node_id); +static void GTM_UnregisterPGXCNode(Port *myport, GTM_PGXCNodeId pgxc_node_id); static bool CreateOptsFile(int argc, char *argv[]); static void CreateDataDirLockFile(void); @@ -608,8 +608,9 @@ GTM_ThreadMain(void *argp) sizeof (GTM_StartupPacket)); pq_getmsgend(&inBuf); - GTM_RegisterCoordinator(thrinfo->thr_conn->con_port, sp.sp_cid); - thrinfo->thr_conn->con_port->is_proxy = sp.sp_isproxy; + GTM_RegisterPGXCNode(thrinfo->thr_conn->con_port, sp.sp_cid); + thrinfo->thr_conn->con_port->remote_type = sp.sp_remotetype; + thrinfo->thr_conn->con_port->is_postmaster = sp.sp_ispostmaster; } { @@ -751,7 +752,7 @@ ProcessCommand(Port *myport, StringInfo input_message) GTM_MessageType mtype; GTM_ProxyMsgHeader proxyhdr; - if (myport->is_proxy) + if (myport->remote_type == PGXC_NODE_GTM_PROXY) pq_copymsgbytes(input_message, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); else proxyhdr.ph_conid = InvalidGTMProxyConnID; @@ -918,14 +919,14 @@ ReadCommand(Port *myport, StringInfo inBuf) static void ProcessCoordinatorCommand(Port *myport, GTM_MessageType mtype, 
StringInfo message) { - GTM_CoordinatorId cid; + GTM_PGXCNodeId cid; - cid = pq_getmsgint(message, sizeof (GTM_CoordinatorId)); + cid = pq_getmsgint(message, sizeof (GTM_PGXCNodeId)); switch (mtype) { case MSG_UNREGISTER_COORD: - GTM_UnregisterCoordinator(myport, cid); + GTM_UnregisterPGXCNode(myport, cid); break; default: @@ -1079,15 +1080,15 @@ ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message) } static void -GTM_RegisterCoordinator(Port *myport, GTM_CoordinatorId cid) +GTM_RegisterPGXCNode(Port *myport, GTM_PGXCNodeId cid) { elog(DEBUG3, "Registering coordinator with cid %d", cid); - myport->coordinator_id = cid; + myport->pgxc_node_id = cid; } static void -GTM_UnregisterCoordinator(Port *myport, GTM_CoordinatorId cid) +GTM_UnregisterPGXCNode(Port *myport, GTM_PGXCNodeId cid) { /* * Do a clean shutdown diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 30f1d1b..4275d91 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -85,14 +85,16 @@ static void ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, static void ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); static void ProcessSnapshotCommand(GTMProxy_ConnectionInfo *conninfo, - GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); static void ProcessSequenceCommand(GTMProxy_ConnectionInfo *conninfo, - GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); + GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message); -static void GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, - GTM_CoordinatorId coordinator_id); -static void GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, - GTM_CoordinatorId coordinator_id); +static void GTMProxy_RegisterPGXCNode(GTMProxy_ConnectionInfo *conninfo, + GTM_PGXCNodeId cid, + GTM_PGXCNodeType remote_type, 
+ bool is_postmaster); +static void GTMProxy_UnregisterPGXCNode(GTMProxy_ConnectionInfo *conninfo, + GTM_PGXCNodeId pgxc_node_id); static void ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, GTM_Result *res); @@ -598,8 +600,8 @@ GTMProxy_ThreadMain(void *argp) /* * Set up connection with the GTM server */ - sprintf(gtm_connect_string, "host=%s port=%d coordinator_id=1 proxy=1", - GTMServerHost, GTMServerPortNumber); + sprintf(gtm_connect_string, "host=%s port=%d pgxc_node_id=1 remote_type=%d", + GTMServerHost, GTMServerPortNumber, PGXC_NODE_GTM_PROXY); thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string); @@ -1244,14 +1246,14 @@ static void ProcessCoordinatorCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, GTM_MessageType mtype, StringInfo message) { - GTM_CoordinatorId cid; + GTM_PGXCNodeId cid; - cid = pq_getmsgint(message, sizeof (GTM_CoordinatorId)); + cid = pq_getmsgint(message, sizeof (GTM_PGXCNodeId)); switch (mtype) { case MSG_UNREGISTER_COORD: - GTMProxy_UnregisterCoordinator(conninfo, cid); + GTMProxy_UnregisterPGXCNode(conninfo, cid); break; default: @@ -1448,15 +1450,17 @@ GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype return; } static void -GTMProxy_RegisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid) +GTMProxy_RegisterPGXCNode(GTMProxy_ConnectionInfo *conninfo, GTM_PGXCNodeId cid, GTM_PGXCNodeType remote_type, bool is_postmaster) { - elog(DEBUG3, "Registering coordinator with cid %d", cid); - conninfo->con_port->coordinator_id = cid; + elog(DEBUG3, "Registering PGXC Node with cid %d", cid); + conninfo->con_port->pgxc_node_id = cid; + conninfo->con_port->remote_type = remote_type; + conninfo->con_port->is_postmaster = is_postmaster; } static void -GTMProxy_UnregisterCoordinator(GTMProxy_ConnectionInfo *conninfo, GTM_CoordinatorId cid) +GTMProxy_UnregisterPGXCNode(GTMProxy_ConnectionInfo *conninfo, GTM_PGXCNodeId cid) { /* * Do a clean shutdown @@ 
-1502,7 +1506,7 @@ GTMProxy_HandshakeConnection(GTMProxy_ConnectionInfo *conninfo) sizeof (GTM_StartupPacket)); pq_getmsgend(&inBuf); - GTMProxy_RegisterCoordinator(conninfo, sp.sp_cid); + GTMProxy_RegisterPGXCNode(conninfo, sp.sp_cid, sp.sp_remotetype, sp.sp_ispostmaster); /* * Send a dummy authentication request message 'R' as the client diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index 37e23a7..77522b2 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -131,10 +131,6 @@ extern MemoryContext TopMostMemoryContext; #if 0 -/* Coordinator registration */ -int GTM_RegisterCoordinator(GTM_CoordInfo *cinfo); -int GTM_UnregisterCoordinator(GTM_CoordinatorId cid); - #endif #endif diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index e8b9984..f918592 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -36,14 +36,26 @@ typedef uint32 GlobalTransactionId; /* 32-bit global transaction ids */ typedef uint32 PGXC_NodeId; -typedef uint32 GTM_CoordinatorId; typedef int16 GTMProxy_ConnID; -typedef uint32 GTM_GIDLen; +typedef uint32 GTM_StrLen; #define InvalidGTMProxyConnID -1 typedef pthread_t GTM_ThreadID; +typedef uint32 GTM_PGXCNodeId; +typedef uint32 GTM_PGXCNodePort; + +/* Possible type of nodes for registration */ +typedef enum GTM_PGXCNodeType +{ + PGXC_NODE_GTM_PROXY, + PGXC_NODE_GTM_PROXY_POSTMASTER, /* Used by Proxy to communicate with GTM and not use Proxy headers */ + PGXC_NODE_COORDINATOR, + PGXC_NODE_DATANODE, + PGXC_NODE_DEFAULT /* In case nothing is associated to connection */ +} GTM_PGXCNodeType; + /* * A unique handle to identify transaction at the GTM. 
It could just be * an index in an array or a pointer to the structure @@ -105,8 +117,9 @@ typedef struct GTM_SnapshotData typedef GTM_SnapshotData *GTM_Snapshot; typedef struct GTM_StartupPacket { - GTM_CoordinatorId sp_cid; - bool sp_isproxy; + GTM_PGXCNodeId sp_cid; + GTM_PGXCNodeType sp_remotetype; + bool sp_ispostmaster; } GTM_StartupPacket; #define InvalidGlobalTransactionId ((GlobalTransactionId) 0) diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h index c883612..47444c6 100644 --- a/src/include/gtm/gtm_txn.h +++ b/src/include/gtm/gtm_txn.h @@ -183,10 +183,10 @@ GTM_TransactionHandle GTM_GIDToHandle(char *gid); /* Transaction Control */ void GTM_InitTxnManager(void); -GTM_TransactionHandle GTM_BeginTransaction(GTM_CoordinatorId coord_id, +GTM_TransactionHandle GTM_BeginTransaction(GTM_PGXCNodeId pgxc_node_id, GTM_IsolationLevel isolevel, bool readonly); -int GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, +int GTM_BeginTransactionMulti(GTM_PGXCNodeId pgxc_node_id, GTM_IsolationLevel isolevel[], bool readonly[], GTMProxy_ConnID connid[], diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h index 0a795de..f8036fe 100644 --- a/src/include/gtm/libpq-be.h +++ b/src/include/gtm/libpq-be.h @@ -47,8 +47,9 @@ typedef struct Port GTMProxy_ConnID conn_id; /* RequestID of this command */ - GTM_CoordinatorId coordinator_id; /* Coordinator ID */ - bool is_proxy; /* Is this a connection from GTM proxy ? */ + GTM_PGXCNodeType remote_type; /* Type of remote connection */ + GTM_PGXCNodeId pgxc_node_id; /* Coordinator ID */ + bool is_postmaster; /* Is remote a node postmaster? */ #define PQ_BUFFER_SIZE 8192 char PqSendBuffer[PQ_BUFFER_SIZE]; diff --git a/src/include/gtm/libpq-int.h b/src/include/gtm/libpq-int.h index 5956de8..557a441 100644 --- a/src/include/gtm/libpq-int.h +++ b/src/include/gtm/libpq-int.h @@ -42,9 +42,9 @@ struct gtm_conn * over above. 
*/ char *pgport; /* the server's communication port */ char *connect_timeout; /* connection timeout (numeric string) */ - char *coordinator_id; /* coordinator id */ - int is_proxy; /* is this a connection to/from a proxy ? */ - + char *pgxc_node_id; /* PGXC Node id */ + int remote_type; /* is this a connection to/from a proxy ? */ + int is_postmaster; /* is this connection to/from a postmaster instance */ /* Optional file to write trace info to */ FILE *Pfdebug; @@ -121,7 +121,7 @@ extern int gtmpqWriteReady(GTM_Conn *conn); */ GTM_Result * GTMPQgetResult(GTM_Conn *conn); extern int gtmpqGetError(GTM_Conn *conn, GTM_Result *result); -void gtmpqFreeResultData(GTM_Result *result, bool is_proxy); +void gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type); #define SOCK_ERRNO errno #define SOCK_ERRNO_SET(e) (errno = (e)) ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/gtm.c | 23 +++++++++++-- src/gtm/client/fe-connect.c | 22 +++++++----- src/gtm/client/fe-protocol.c | 10 +++--- src/gtm/client/gtm_client.c | 10 +++--- src/gtm/client/test/test_seq.c | 10 ++++-- src/gtm/client/test/test_snap.c | 5 ++- src/gtm/client/test/test_snapperf.c | 5 ++- src/gtm/client/test/test_txn.c | 10 ++++-- src/gtm/client/test/test_txnperf.c | 12 +++--- src/gtm/common/elog.c | 2 +- src/gtm/main/gtm_seq.c | 34 +++++++++--------- src/gtm/main/gtm_snap.c | 8 ++-- src/gtm/main/gtm_txn.c | 62 +++++++++++++++++----------------- src/gtm/main/main.c | 23 +++++++------ src/gtm/proxy/proxy_main.c | 36 +++++++++++--------- src/include/gtm/gtm.h | 4 -- src/include/gtm/gtm_c.h | 21 +++++++++-- src/include/gtm/gtm_txn.h | 4 +- src/include/gtm/libpq-be.h | 5 ++- src/include/gtm/libpq-int.h | 8 ++-- 20 files changed, 181 insertions(+), 133 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-13 22:35:28
|
Project "Postgres-XC". The branch, master has been updated via 75fbef774e81432cdd5ff4eeabf203b12be560a9 (commit) from 8506374787c98b149949d4fcbbb88b51b3b9a0fc (commit) - Log ----------------------------------------------------------------- commit 75fbef774e81432cdd5ff4eeabf203b12be560a9 Author: Mason Sharp <ma...@us...> Date: Mon Dec 13 17:31:27 2010 -0500 Fixed bug in INSERT when omitting a value for the partitioning column. diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 4c65f49..4191455 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -359,11 +359,15 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, } else { - /* - * No partitioning value passed in - * (no where qualification on part column - use all) - */ - exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); + /* If no info, go to node 1 */ + if (accessType == RELATION_ACCESS_WRITE) + exec_nodes->nodelist = lappend_int(NULL, 1); + else + /* + * No partitioning value passed in + * (no where qualification on part column - use all) + */ + exec_nodes->nodelist = list_copy(rel_loc_info->nodeList); } break; diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 4c677aa..fa61826 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -549,8 +549,8 @@ get_plan_nodes_insert(Query *query, RemoteQuery *step) if (!lc) { - /* give up */ - step->exec_nodes = NULL; + /* Skip rest, handle NULL */ + step->exec_nodes = GetRelationNodes(rel_loc_info, NULL, RELATION_ACCESS_WRITE); return; } ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/locator/locator.c | 14 +++++++++----- src/backend/pgxc/plan/planner.c | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-10 14:22:47
|
Project "Postgres-XC". The branch, master has been updated via 8506374787c98b149949d4fcbbb88b51b3b9a0fc (commit) from 8d1567d81d013065840497785ea1c34017b6d748 (commit) - Log ----------------------------------------------------------------- commit 8506374787c98b149949d4fcbbb88b51b3b9a0fc Author: Mason Sharp <ma...@us...> Date: Fri Dec 10 09:21:29 2010 -0500 Fixed a bug with INSERT SELECT when an input value is NULL. Reported by Benny, fix by Mason. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 55786ba..436a1dd 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3943,7 +3943,6 @@ DoInsertSelectCopy(EState *estate, TupleTableSlot *slot) attnamelist = lappend(attnamelist, makeString(target->resname)); } cstate->attnumlist = CopyGetAttnums(cstate->tupDesc, cstate->rel, attnamelist); - cstate->null_print_client = cstate->null_print; /* default */ /* We use fe_msgbuf as a per-row buffer regardless of copy_dest */ cstate->fe_msgbuf = makeStringInfo(); @@ -3975,6 +3974,7 @@ DoInsertSelectCopy(EState *estate, TupleTableSlot *slot) if (!cstate->null_print) cstate->null_print = cstate->csv_mode ? "" : "\\N"; cstate->null_print_len = strlen(cstate->null_print); + cstate->null_print_client = cstate->null_print; /* default */ if (cstate->csv_mode) { ----------------------------------------------------------------------- Summary of changes: src/backend/commands/copy.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-09 23:13:35
|
Project "Postgres-XC". The branch, master has been updated via 8d1567d81d013065840497785ea1c34017b6d748 (commit) from 1c8a6a3058924e25d680477ab87bdf80de447e61 (commit) - Log ----------------------------------------------------------------- commit 8d1567d81d013065840497785ea1c34017b6d748 Author: Mason Sharp <ma...@us...> Date: Thu Dec 9 17:54:29 2010 -0500 Fix a problem when more values are selected than total columns in the query. Example: SELECT *, CTID FROM mytable. From Andrei: I found the problem was in createplan.c, create_remotequery_plan() function. It built up column list using PGXC-added TupleDesc originally taken from the rd_att field of RelationData structure, which does not include system columns. I changed this and deparse target list expressions instead. This change fixes the CTID problem, and allowed to remove the TupleDesc field from RelOptInfo which is not used any more. By Andrei Martsinchyk diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 3b20297..63893b4 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2216,7 +2216,6 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, Index scan_relid = best_path->parent->relid; RangeTblEntry *rte; char *wherestr = NULL; - Bitmapset *varattnos = NULL; List *remote_scan_clauses = NIL; List *local_scan_clauses = NIL; Oid nspid; @@ -2225,8 +2224,8 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, const char *nspname_q; const char *relname_q; const char *aliasname_q; - int i; - TupleDesc tupdesc; + ListCell *lc; + List *deparse_context; bool first; StringInfoData sql; RelationLocInfo *rel_loc_info; @@ -2237,6 +2236,9 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, Assert(best_path->parent->rtekind == RTE_RELATION); Assert(rte->rtekind == RTE_RELATION); + deparse_context = deparse_context_for_remotequery( + get_rel_name(rte->relid), rte->relid); + /* Sort clauses into best 
execution order */ scan_clauses = order_qual_clauses(root, scan_clauses); @@ -2246,7 +2248,7 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, if (scan_clauses) { ListCell *l; - + foreach(l, (List *)scan_clauses) { Node *clause = lfirst(l); @@ -2258,7 +2260,7 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, } } - /* + /* * Incorporate any remote_scan_clauses into the WHERE clause that * we intend to push to the remote server. */ @@ -2267,13 +2269,9 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, char *sep = ""; ListCell *l; StringInfoData buf; - List *deparse_context; initStringInfo(&buf); - deparse_context = deparse_context_for_remotequery( - get_rel_name(rte->relid), rte->relid); - /* * remote_scan_clauses is a list of scan clauses (restrictions) that we * can push to the remote server. We want to deparse each of those @@ -2289,20 +2287,11 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, appendStringInfo(&buf, "%s", deparse_expression(clause, deparse_context, false, false)); sep = " AND "; } - + wherestr = buf.data; } /* - * Now walk through the target list and the scan clauses to get the - * interesting attributes. Only those attributes will be fetched from the - * remote side. - */ - varattnos = pull_varattnos_varno((Node *) best_path->parent->reltargetlist, best_path->parent->relid, - varattnos); - varattnos = pull_varattnos_varno((Node *) local_scan_clauses, - best_path->parent->relid, varattnos); - /* * Scanning multiple relations in a RemoteQuery node is not supported. */ prefix = false; @@ -2331,27 +2320,18 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, * columns because some columns may be used only in parent Sort/Agg/Limit * nodes. 
*/ - tupdesc = best_path->parent->reltupdesc; first = true; - for (i = 0; i < tupdesc->natts; i++) + foreach (lc, tlist) { - /* skip dropped attributes */ - if (tupdesc->attrs[i]->attisdropped) - continue; + TargetEntry *tle = (TargetEntry *) lfirst(lc); if (!first) appendStringInfoString(&sql, ", "); - if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, varattnos)) - { - if (prefix) - appendStringInfo(&sql, "%s.%s", - aliasname_q, tupdesc->attrs[i]->attname.data); - else - appendStringInfo(&sql, "%s", tupdesc->attrs[i]->attname.data); - } - else - appendStringInfo(&sql, "%s", "NULL"); + appendStringInfo(&sql, "%s", deparse_expression((Node *) tle->expr, + deparse_context, + false, + false)); first = false; } @@ -2377,13 +2357,10 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, if (wherestr) { - appendStringInfo(&sql, " WHERE "); - appendStringInfo(&sql, "%s", wherestr); + appendStringInfo(&sql, " WHERE %s", wherestr); pfree(wherestr); } - bms_free(varattnos); - scan_plan = make_remotequery(tlist, rte, local_scan_clauses, diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index b1c8bcb..04bf594 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -92,10 +92,6 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind) rel->index_outer_relids = NULL; rel->index_inner_paths = NIL; -#ifdef PGXC - rel->reltupdesc = rte->reltupdesc; -#endif - /* Check type of rtable entry */ switch (rte->rtekind) { diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index d537855..6f15f48 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -382,10 +382,6 @@ typedef struct RelOptInfo * clauses */ List *index_inner_paths; /* InnerIndexscanInfo nodes */ -#ifdef PGXC - TupleDesc reltupdesc; -#endif - /* * Inner indexscans are not in the main pathlist because they are not * usable except in specific join contexts. 
We use the index_inner_paths ----------------------------------------------------------------------- Summary of changes: src/backend/optimizer/plan/createplan.c | 53 +++++++++---------------------- src/backend/optimizer/util/relnode.c | 4 -- src/include/nodes/relation.h | 4 -- 3 files changed, 15 insertions(+), 46 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-09 22:25:49
|
Project "Postgres-XC". The branch, master has been updated via 1c8a6a3058924e25d680477ab87bdf80de447e61 (commit) from 88a8a1f4ada3b76dd075b446c392789901ab4958 (commit) - Log ----------------------------------------------------------------- commit 1c8a6a3058924e25d680477ab87bdf80de447e61 Author: Mason Sharp <ma...@us...> Date: Thu Dec 9 17:15:44 2010 -0500 Add support for INSERT SELECT. Also, initial changes for setting execution nodes in general planner. We execute a query normally, directed by the Coordinator, and then use COPY running down on the data nodes to insert the data there. We also check for a special case of INSERT SELECT where the source and destination columns are both the distribution columns of tables, and where the query is a single step query. In such a case, we just execute the query locally on the data nodes without the use of COPY directed by the Coordinator. In testing we uncovered an issue for INSERT SELECT when the query is from a replicated table; it was selecting from all tables because exec_nodes was not set for the RemoteQuery. As a result, this commit also sets the exec_nodes for base RemoteQuery structs as well as join reduced ones. It does not yet, however, take into account WHERE clause equality conditions against the distribution column, as is done in a regular SELECT. This is left as a future optimization, best done as a step for further merging the Postgres-XC planner and the standard planner. 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e1576fc..55786ba 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -61,6 +61,9 @@ typedef enum CopyDest COPY_FILE, /* to/from file */ COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ COPY_NEW_FE /* to/from frontend (3.0 protocol) */ +#ifdef PGXC + ,COPY_BUFFER /* Do not send, just prepare */ +#endif } CopyDest; /* @@ -181,6 +184,7 @@ typedef struct CopyStateData int hash_idx; /* index of the hash column */ PGXCNodeHandle **connections; /* Involved data node connections */ + TupleDesc tupDesc; /* for INSERT SELECT */ #endif } CopyStateData; @@ -301,6 +305,17 @@ static void CopySendInt16(CopyState cstate, int16 val); static bool CopyGetInt16(CopyState cstate, int16 *val); +#ifdef PGXC +static ExecNodes *build_copy_statement(CopyState cstate, List *attnamelist, + TupleDesc tupDesc, bool is_from, List *force_quote, List *force_notnull); +/* + * A kluge here making this static to avoid having to move the + * CopyState definition to a header file making it harder to merge + * with the vanilla PostgreSQL code + */ +static CopyState insertstate; +#endif + /* * Send copy start/stop messages for frontend copies. These have changed * in past protocol redesigns. 
@@ -487,6 +502,11 @@ CopySendEndOfRow(CopyState cstate) /* Dump the accumulated row as one CopyData message */ (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len); break; +#ifdef PGXC + case COPY_BUFFER: + /* Do not send yet anywhere, just return */ + return; +#endif } resetStringInfo(fe_msgbuf); @@ -598,6 +618,11 @@ CopyGetData(CopyState cstate, void *databuf, int minread, int maxread) bytesread += avail; } break; +#ifdef PGXC + case COPY_BUFFER: + elog(ERROR, "COPY_BUFFER not allowed in this context"); + break; +#endif } return bytesread; @@ -1133,155 +1158,10 @@ DoCopy(const CopyStmt *stmt, const char *queryString) errmsg("table \"%s\" does not have OIDs", RelationGetRelationName(cstate->rel)))); #ifdef PGXC - /* Get locator information */ + /* Get copy statement and execution node information */ if (IS_PGXC_COORDINATOR) { - char *hash_att; - - exec_nodes = makeNode(ExecNodes); - - /* - * If target table does not exists on nodes (e.g. system table) - * the location info returned is NULL. This is the criteria, when - * we need to run Copy on coordinator - */ - cstate->rel_loc = GetRelationLocInfo(RelationGetRelid(cstate->rel)); - - hash_att = GetRelationHashColumn(cstate->rel_loc); - if (cstate->rel_loc) - { - if (is_from || hash_att) - exec_nodes->nodelist = list_copy(cstate->rel_loc->nodeList); - else - { - /* - * Pick up one node only - * This case corresponds to a replicated table with COPY TO - */ - exec_nodes->nodelist = GetAnyDataNode(); - } - } - - cstate->hash_idx = -1; - if (hash_att) - { - List *attnums; - ListCell *cur; - - attnums = CopyGetAttnums(tupDesc, cstate->rel, attnamelist); - foreach(cur, attnums) - { - int attnum = lfirst_int(cur); - if (namestrcmp(&(tupDesc->attrs[attnum - 1]->attname), hash_att) == 0) - { - cstate->hash_idx = attnum - 1; - break; - } - } - } - - /* - * Build up query string for the data nodes, it should match - * to original string, but should have STDIN/STDOUT instead - * of filename. 
- */ - initStringInfo(&cstate->query_buf); - - appendStringInfoString(&cstate->query_buf, "COPY "); - appendStringInfo(&cstate->query_buf, "%s", RelationGetRelationName(cstate->rel)); - - if (attnamelist) - { - ListCell *cell; - ListCell *prev = NULL; - appendStringInfoString(&cstate->query_buf, " ("); - foreach (cell, attnamelist) - { - if (prev) - appendStringInfoString(&cstate->query_buf, ", "); - CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); - prev = cell; - } - appendStringInfoChar(&cstate->query_buf, ')'); - } - - if (stmt->is_from) - appendStringInfoString(&cstate->query_buf, " FROM STDIN"); - else - appendStringInfoString(&cstate->query_buf, " TO STDOUT"); - - - if (cstate->binary) - appendStringInfoString(&cstate->query_buf, " BINARY"); - - if (cstate->oids) - appendStringInfoString(&cstate->query_buf, " OIDS"); - - if (cstate->delim) - if ((!cstate->csv_mode && cstate->delim[0] != '\t') - || (cstate->csv_mode && cstate->delim[0] != ',')) - { - appendStringInfoString(&cstate->query_buf, " DELIMITER AS "); - CopyQuoteStr(&cstate->query_buf, cstate->delim); - } - - if (cstate->null_print) - if ((!cstate->csv_mode && strcmp(cstate->null_print, "\\N")) - || (cstate->csv_mode && strcmp(cstate->null_print, ""))) - { - appendStringInfoString(&cstate->query_buf, " NULL AS "); - CopyQuoteStr(&cstate->query_buf, cstate->null_print); - } - - if (cstate->csv_mode) - appendStringInfoString(&cstate->query_buf, " CSV"); - - /* - * Only rewrite the header part for COPY FROM, - * doing that for COPY TO results in multiple headers in output - */ - if (cstate->header_line && stmt->is_from) - appendStringInfoString(&cstate->query_buf, " HEADER"); - - if (cstate->quote && cstate->quote[0] == '"') - { - appendStringInfoString(&cstate->query_buf, " QUOTE AS "); - CopyQuoteStr(&cstate->query_buf, cstate->quote); - } - - if (cstate->escape && cstate->quote && cstate->escape[0] == cstate->quote[0]) - { - appendStringInfoString(&cstate->query_buf, " ESCAPE AS "); - 
CopyQuoteStr(&cstate->query_buf, cstate->escape); - } - - if (force_quote) - { - ListCell *cell; - ListCell *prev = NULL; - appendStringInfoString(&cstate->query_buf, " FORCE QUOTE "); - foreach (cell, force_quote) - { - if (prev) - appendStringInfoString(&cstate->query_buf, ", "); - CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); - prev = cell; - } - } - - if (force_notnull) - { - ListCell *cell; - ListCell *prev = NULL; - appendStringInfoString(&cstate->query_buf, " FORCE NOT NULL "); - foreach (cell, force_notnull) - { - if (prev) - appendStringInfoString(&cstate->query_buf, ", "); - CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); - prev = cell; - } - } + exec_nodes = build_copy_statement(cstate, attnamelist, tupDesc, is_from, force_quote, force_notnull); } #endif } @@ -3848,3 +3728,324 @@ CreateCopyDestReceiver(void) return (DestReceiver *) self; } + +#ifdef PGXC +/* + * Rebuild a COPY statement in cstate and set ExecNodes + */ +static ExecNodes* +build_copy_statement(CopyState cstate, List *attnamelist, + TupleDesc tupDesc, bool is_from, List *force_quote, List *force_notnull) +{ + char *hash_att; + + + ExecNodes *exec_nodes = makeNode(ExecNodes); + + /* + * If target table does not exists on nodes (e.g. system table) + * the location info returned is NULL. 
This is the criteria, when + * we need to run Copy on coordinator + */ + cstate->rel_loc = GetRelationLocInfo(RelationGetRelid(cstate->rel)); + + hash_att = GetRelationHashColumn(cstate->rel_loc); + if (cstate->rel_loc) + { + if (is_from || hash_att) + exec_nodes->nodelist = list_copy(cstate->rel_loc->nodeList); + else + { + /* + * Pick up one node only + * This case corresponds to a replicated table with COPY TO + */ + exec_nodes->nodelist = GetAnyDataNode(); + } + } + + cstate->hash_idx = -1; + if (hash_att) + { + List *attnums; + ListCell *cur; + + attnums = CopyGetAttnums(tupDesc, cstate->rel, attnamelist); + foreach(cur, attnums) + { + int attnum = lfirst_int(cur); + if (namestrcmp(&(tupDesc->attrs[attnum - 1]->attname), hash_att) == 0) + { + cstate->hash_idx = attnum - 1; + break; + } + } + } + + /* + * Build up query string for the data nodes, it should match + * to original string, but should have STDIN/STDOUT instead + * of filename. + */ + initStringInfo(&cstate->query_buf); + + appendStringInfoString(&cstate->query_buf, "COPY "); + appendStringInfo(&cstate->query_buf, "%s", RelationGetRelationName(cstate->rel)); + + if (attnamelist) + { + ListCell *cell; + ListCell *prev = NULL; + appendStringInfoString(&cstate->query_buf, " ("); + foreach (cell, attnamelist) + { + if (prev) + appendStringInfoString(&cstate->query_buf, ", "); + CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); + prev = cell; + } + appendStringInfoChar(&cstate->query_buf, ')'); + } + + if (is_from) + appendStringInfoString(&cstate->query_buf, " FROM STDIN"); + else + appendStringInfoString(&cstate->query_buf, " TO STDOUT"); + + + if (cstate->binary) + appendStringInfoString(&cstate->query_buf, " BINARY"); + + if (cstate->oids) + appendStringInfoString(&cstate->query_buf, " OIDS"); + + if (cstate->delim) + if ((!cstate->csv_mode && cstate->delim[0] != '\t') + || (cstate->csv_mode && cstate->delim[0] != ',')) + { + appendStringInfoString(&cstate->query_buf, " DELIMITER AS "); + 
CopyQuoteStr(&cstate->query_buf, cstate->delim); + } + + if (cstate->null_print) + if ((!cstate->csv_mode && strcmp(cstate->null_print, "\\N")) + || (cstate->csv_mode && strcmp(cstate->null_print, ""))) + { + appendStringInfoString(&cstate->query_buf, " NULL AS "); + CopyQuoteStr(&cstate->query_buf, cstate->null_print); + } + + if (cstate->csv_mode) + appendStringInfoString(&cstate->query_buf, " CSV"); + + /* + * Only rewrite the header part for COPY FROM, + * doing that for COPY TO results in multiple headers in output + */ + if (cstate->header_line && is_from) + appendStringInfoString(&cstate->query_buf, " HEADER"); + + if (cstate->quote && cstate->quote[0] == '"') + { + appendStringInfoString(&cstate->query_buf, " QUOTE AS "); + CopyQuoteStr(&cstate->query_buf, cstate->quote); + } + + if (cstate->escape && cstate->quote && cstate->escape[0] == cstate->quote[0]) + { + appendStringInfoString(&cstate->query_buf, " ESCAPE AS "); + CopyQuoteStr(&cstate->query_buf, cstate->escape); + } + + if (force_quote) + { + ListCell *cell; + ListCell *prev = NULL; + appendStringInfoString(&cstate->query_buf, " FORCE QUOTE "); + foreach (cell, force_quote) + { + if (prev) + appendStringInfoString(&cstate->query_buf, ", "); + CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); + prev = cell; + } + } + + if (force_notnull) + { + ListCell *cell; + ListCell *prev = NULL; + appendStringInfoString(&cstate->query_buf, " FORCE NOT NULL "); + foreach (cell, force_notnull) + { + if (prev) + appendStringInfoString(&cstate->query_buf, ", "); + CopyQuoteIdentifier(&cstate->query_buf, strVal(lfirst(cell))); + prev = cell; + } + } + return exec_nodes; +} + +/* + * Use COPY for handling INSERT SELECT + * It may be a bit better to use binary mode here, but + * we have not implemented binary support for COPY yet. + * + * We borrow some code from CopyTo and DoCopy here. 
+ * We do not refactor them so that it is later easier to remerge + * with vanilla PostgreSQL + */ +void +DoInsertSelectCopy(EState *estate, TupleTableSlot *slot) +{ + ExecNodes *exec_nodes; + HeapTuple tuple; + Datum *values; + bool *nulls; + Datum *hash_value = NULL; + MemoryContext oldcontext; + CopyState cstate; + + + Assert(IS_PGXC_COORDINATOR); + + /* See if we need to initialize COPY (first tuple) */ + if (estate->es_processed == 0) + { + ListCell *lc; + List *attnamelist = NIL; + ResultRelInfo *resultRelInfo = estate->es_result_relation_info; + Form_pg_attribute *attr; + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + exec_nodes = makeNode(ExecNodes); + + /* + * We use the cstate struct here, though we do not need everything + * We will just use the properties we are interested in here. + */ + insertstate = (CopyStateData *) palloc0(sizeof(CopyStateData)); + cstate = insertstate; + + cstate->rowcontext = AllocSetContextCreate(CurrentMemoryContext, + "COPY TO", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + cstate->rel = resultRelInfo->ri_RelationDesc; + cstate->tupDesc = RelationGetDescr(cstate->rel); + + foreach(lc, estate->es_plannedstmt->planTree->targetlist) + { + TargetEntry *target = (TargetEntry *) lfirst(lc); + attnamelist = lappend(attnamelist, makeString(target->resname)); + } + cstate->attnumlist = CopyGetAttnums(cstate->tupDesc, cstate->rel, attnamelist); + cstate->null_print_client = cstate->null_print; /* default */ + + /* We use fe_msgbuf as a per-row buffer regardless of copy_dest */ + cstate->fe_msgbuf = makeStringInfo(); + attr = cstate->tupDesc->attrs; + + /* Get info about the columns we need to process. 
*/ + cstate->out_functions = (FmgrInfo *) palloc(cstate->tupDesc->natts * sizeof(FmgrInfo)); + foreach(lc, cstate->attnumlist) + { + int attnum = lfirst_int(lc); + Oid out_func_oid; + bool isvarlena; + + if (cstate->binary) + getTypeBinaryOutputInfo(attr[attnum - 1]->atttypid, + &out_func_oid, + &isvarlena); + else + getTypeOutputInfo(attr[attnum - 1]->atttypid, + &out_func_oid, + &isvarlena); + fmgr_info(out_func_oid, &cstate->out_functions[attnum - 1]); + } + + /* Set defaults for omitted options */ + if (!cstate->delim) + cstate->delim = cstate->csv_mode ? "," : "\t"; + + if (!cstate->null_print) + cstate->null_print = cstate->csv_mode ? "" : "\\N"; + cstate->null_print_len = strlen(cstate->null_print); + + if (cstate->csv_mode) + { + if (!cstate->quote) + cstate->quote = "\""; + if (!cstate->escape) + cstate->escape = cstate->quote; + } + + exec_nodes = build_copy_statement(cstate, attnamelist, + cstate->tupDesc, true, NULL, NULL); + + cstate->connections = DataNodeCopyBegin(cstate->query_buf.data, + exec_nodes->nodelist, + GetActiveSnapshot(), + true); + + if (!cstate->connections) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Failed to initialize data nodes for COPY"))); + + cstate->copy_dest = COPY_BUFFER; + + MemoryContextSwitchTo(oldcontext); + } + cstate = insertstate; + + values = (Datum *) palloc(cstate->tupDesc->natts * sizeof(Datum)); + nulls = (bool *) palloc(cstate->tupDesc->natts * sizeof(bool)); + + /* Process Tuple */ + /* We need to format the line for sending to data nodes */ + tuple = ExecMaterializeSlot(slot); + + /* Deconstruct the tuple ... 
faster than repeated heap_getattr */ + heap_deform_tuple(tuple, cstate->tupDesc, values, nulls); + + /* Format the input tuple for sending */ + CopyOneRowTo(cstate, 0, values, nulls); + + /* Get hash partition column, if any */ + if (cstate->hash_idx >= 0 && !nulls[cstate->hash_idx]) + hash_value = &values[cstate->hash_idx]; + + /* Send item to the appropriate data node(s) (buffer) */ + if (DataNodeCopyIn(cstate->fe_msgbuf->data, + cstate->fe_msgbuf->len, + GetRelationNodes(cstate->rel_loc, (long *)hash_value, RELATION_ACCESS_WRITE), + cstate->connections)) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("Copy failed on a data node"))); + + resetStringInfo(cstate->fe_msgbuf); + estate->es_processed++; +} + +/* + * + */ +void +EndInsertSelectCopy(void) +{ + Assert(IS_PGXC_COORDINATOR); + + DataNodeCopyFinish( + insertstate->connections, + primary_data_node, + COMBINE_TYPE_NONE); + pfree(insertstate->connections); + MemoryContextDelete(insertstate->rowcontext); +} +#endif diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 86db1eb..bb30d32 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -61,6 +61,7 @@ #include "utils/tqual.h" #ifdef PGXC #include "pgxc/pgxc.h" +#include "commands/copy.h" #endif /* Hooks for plugins to get control in ExecutorStart/Run/End() */ @@ -1685,6 +1686,26 @@ lnext: ; break; case CMD_INSERT: +#ifdef PGXC + /* + * If we get here on the Coordinator, we may have INSERT SELECT + * To handle INSERT SELECT, we use COPY to send down the nodes + */ + if (IS_PGXC_COORDINATOR && IsA(planstate, ResultState)) + { + PG_TRY(); + { + DoInsertSelectCopy(estate, slot); + } + PG_CATCH(); + { + EndInsertSelectCopy(); + PG_RE_THROW(); + } + PG_END_TRY(); + } + else +#endif ExecInsert(slot, tupleid, planSlot, dest, estate); break; @@ -1712,6 +1733,12 @@ lnext: ; break; } +#ifdef PGXC + /* See if we need to close a COPY started for INSERT SELECT */ + if (IS_PGXC_COORDINATOR && 
operation == CMD_INSERT && IsA(planstate, ResultState)) + EndInsertSelectCopy(); +#endif + /* * Process AFTER EACH STATEMENT triggers */ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a753e95..3b20297 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -630,6 +630,7 @@ static Plan * create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Plan *outer_plan, Plan *inner_plan) { NestLoop *nest_parent; + JoinReduceInfo join_info; if (!enable_remotejoin) return parent; @@ -638,10 +639,6 @@ create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Pla if (root->hasPseudoConstantQuals) return parent; - /* Works only for SELECT commands right now */ - if (root->parse->commandType != CMD_SELECT) - return parent; - /* do not optimize CURSOR based select statements */ if (root->parse->rowMarks != NIL) return parent; @@ -664,7 +661,6 @@ create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Pla { int i; List *rtable_list = NIL; - bool partitioned_replicated_join = false; Material *outer_mat = (Material *)outer_plan; Material *inner_mat = (Material *)inner_plan; @@ -692,7 +688,7 @@ create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Pla } /* XXX Check if the join optimization is possible */ - if (IsJoinReducible(inner, outer, rtable_list, best_path, &partitioned_replicated_join)) + if (IsJoinReducible(inner, outer, rtable_list, best_path, &join_info)) { RemoteQuery *result; Plan *result_plan; @@ -829,6 +825,7 @@ create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Pla result->outer_reduce_level = outer->reduce_level; result->inner_relids = in_relids; result->outer_relids = out_relids; + result->exec_nodes = copyObject(join_info.exec_nodes); appendStringInfo(&fromlist, " %s (%s) %s", pname, inner->sql_statement, quote_identifier(in_alias)); @@ -917,8 +914,7 @@ 
create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Pla /* set_plan_refs needs this later */ result->base_tlist = base_tlist; result->relname = "__FOREIGN_QUERY__"; - - result->partitioned_replicated = partitioned_replicated_join; + result->partitioned_replicated = join_info.partitioned_replicated; /* * if there were any local scan clauses stick them up here. They @@ -2233,6 +2229,8 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, TupleDesc tupdesc; bool first; StringInfoData sql; + RelationLocInfo *rel_loc_info; + Assert(scan_relid > 0); rte = planner_rt_fetch(scan_relid, root); @@ -2393,6 +2391,21 @@ create_remotequery_plan(PlannerInfo *root, Path *best_path, scan_plan->sql_statement = sql.data; + /* + * Populate what nodes we execute on. + * This is still basic, and was done to make sure we do not select + * a replicated table from all nodes. + * It does not take into account conditions on partitioned relations + * that could reduce to one node. To do that, we need to move general + * planning earlier. + */ + rel_loc_info = GetRelationLocInfo(rte->relid); + scan_plan->exec_nodes = makeNode(ExecNodes); + scan_plan->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_USER; + scan_plan->exec_nodes->baselocatortype = rel_loc_info->locatorType; + scan_plan->exec_nodes = GetRelationNodes(rel_loc_info, + NULL, + RELATION_ACCESS_READ); copy_path_costsize(&scan_plan->scan.plan, best_path); /* PGXCTODO - get better estimates */ diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 02b863a..4c677aa 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -441,35 +441,37 @@ get_base_var(Var *var, XCWalkerContext *context) /* * get_plan_nodes_insert - determine nodes on which to execute insert. + * + * We handle INSERT ... VALUES. + * If we have INSERT SELECT, we try and see if it is patitioned-based + * inserting into a partitioned-based. 
+ * + * We set step->exec_nodes if we determine the single-step execution + * nodes. If it is still NULL after returning from this function, + * then the caller should use the regular PG planner */ -static ExecNodes * -get_plan_nodes_insert(Query *query) +static void +get_plan_nodes_insert(Query *query, RemoteQuery *step) { RangeTblEntry *rte; RelationLocInfo *rel_loc_info; Const *constant; - ExecNodes *exec_nodes; ListCell *lc; long part_value; long *part_value_ptr = NULL; Expr *eval_expr = NULL; - /* Looks complex (correlated?) - best to skip */ - if (query->jointree != NULL && query->jointree->fromlist != NULL) - return NULL; - /* Make sure there is just one table */ - if (query->rtable == NULL) - return NULL; + step->exec_nodes = NULL; rte = (RangeTblEntry *) list_nth(query->rtable, query->resultRelation - 1); - if (rte != NULL && rte->rtekind != RTE_RELATION) /* Bad relation type */ - return NULL; + return; - /* See if we have the partitioned case. */ + + /* Get result relation info */ rel_loc_info = GetRelationLocInfo(rte->relid); if (!rel_loc_info) @@ -477,13 +479,62 @@ get_plan_nodes_insert(Query *query) (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("Could not find relation for oid = %d", rte->relid)))); + if (query->jointree != NULL && query->jointree->fromlist != NULL) + { + /* INSERT SELECT suspected */ + + /* We only optimize for when the destination is partitioned */ + if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + return; + + /* + * See if it is "single-step" + * Optimize for just known common case with 2 RTE entries + */ + if (query->resultRelation == 1 && query->rtable->length == 2) + { + RangeTblEntry *sub_rte = list_nth(query->rtable, 1); + + /* + * Get step->exec_nodes for the SELECT part of INSERT-SELECT + * to see if it is single-step + */ + if (sub_rte->rtekind == RTE_SUBQUERY && + !sub_rte->subquery->limitCount && + !sub_rte->subquery->limitOffset) + get_plan_nodes(sub_rte->subquery, step, RELATION_ACCESS_READ); + } + + /* Send to 
general planner if the query is multiple step */ + if (!step->exec_nodes) + return; + + /* If the source is not hash-based (eg, replicated) also send + * through general planner + */ + if (step->exec_nodes->baselocatortype != LOCATOR_TYPE_HASH) + { + step->exec_nodes = NULL; + return; + } + + /* + * If step->exec_nodes is not null, it is single step. + * Continue and check for destination table type cases below + */ + } + + if (rel_loc_info->locatorType == LOCATOR_TYPE_HASH && rel_loc_info->partAttrName != NULL) { + Expr *checkexpr; + TargetEntry *tle = NULL; + /* It is a partitioned table, get value by looking in targetList */ foreach(lc, query->targetList) { - TargetEntry *tle = (TargetEntry *) lfirst(lc); + tle = (TargetEntry *) lfirst(lc); if (tle->resjunk) continue; @@ -493,46 +544,95 @@ get_plan_nodes_insert(Query *query) * designated partitioned column */ if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0) + break; + } + + if (!lc) + { + /* give up */ + step->exec_nodes = NULL; + return; + } + + /* We found the TargetEntry for the partition column */ + checkexpr = tle->expr; + + /* Handle INSERT SELECT case */ + if (query->jointree != NULL && query->jointree->fromlist != NULL) + { + if (IsA(checkexpr,Var)) { - /* We may have a cast, try and handle it */ - Expr *checkexpr = tle->expr; + XCWalkerContext context; + ColumnBase *col_base; + RelationLocInfo *source_rel_loc_info; - if (!IsA(tle->expr, Const)) + /* Look for expression populating partition column */ + InitXCWalkerContext(&context); + context.query = query; + context.rtables = lappend(context.rtables, query->rtable); + col_base = get_base_var((Var*) checkexpr, &context); + + if (!col_base) { - eval_expr = (Expr *) eval_const_expressions(NULL, (Node *) tle->expr); - checkexpr = get_numeric_constant(eval_expr); + step->exec_nodes = NULL; + return; } - if (checkexpr == NULL) - break; /* no constant */ + /* See if it is also a partitioned table */ + source_rel_loc_info = 
GetRelationLocInfo(col_base->relid); - constant = (Const *) checkexpr; + if (!source_rel_loc_info) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + (errmsg("Could not find relation for oid = %d", rte->relid)))); - if (constant->consttype == INT4OID || - constant->consttype == INT2OID || - constant->consttype == INT8OID) + if (source_rel_loc_info->locatorType == LOCATOR_TYPE_HASH && + strcmp(col_base->colname, source_rel_loc_info->partAttrName) == 0) { - part_value = (long) constant->constvalue; - part_value_ptr = &part_value; - + /* + * Partition columns match, we have a "single-step INSERT SELECT". + * It is OK to use step->exec_nodes + */ + return; } - /* PGXCTODO - handle other data types */ - /* - else - if (constant->consttype == VARCHAR ... - */ } + /* Multi-step INSERT SELECT or some other case. Use general planner */ + step->exec_nodes = NULL; + return; + } + else + { + /* Check for constant */ + + /* We may have a cast, try and handle it */ + if (!IsA(tle->expr, Const)) + { + eval_expr = (Expr *) eval_const_expressions(NULL, (Node *) tle->expr); + checkexpr = get_numeric_constant(eval_expr); + } + + if (checkexpr == NULL) + return; /* no constant */ + + constant = (Const *) checkexpr; + + if (constant->consttype == INT4OID || + constant->consttype == INT2OID || + constant->consttype == INT8OID) + { + part_value = (long) constant->constvalue; + part_value_ptr = &part_value; + } + /* PGXCTODO - handle other data types */ } } /* single call handles both replicated and partitioned types */ - exec_nodes = GetRelationNodes(rel_loc_info, part_value_ptr, + step->exec_nodes = GetRelationNodes(rel_loc_info, part_value_ptr, RELATION_ACCESS_WRITE); if (eval_expr) pfree(eval_expr); - - return exec_nodes; } @@ -1665,7 +1765,7 @@ get_plan_nodes_command(Query *query, RemoteQuery *step) break; case CMD_INSERT: - step->exec_nodes = get_plan_nodes_insert(query); + get_plan_nodes_insert(query, step); break; case CMD_UPDATE: @@ -2794,24 +2894,28 @@ 
free_query_step(RemoteQuery *query_step) * If the join between the two RemoteQuery nodes is partitioned - partitioned * it is always reducibile safely, * - * RemoteQuery *innernode - the inner node - * RemoteQuery *outernode - the outer node - * bool *partitioned_replicated - set to true if we have a partitioned-replicated + * RemoteQuery *innernode - the inner node + * RemoteQuery *outernode - the outer node + * List *rtable_list - rtables + * JoinPath *join_path - used to examine join restrictions + * PGXCJoinInfo *join_info - contains info about the join reduction + * join_info->partitioned_replicated is set to true if we have a partitioned-replicated * join. We want to use replicated tables with non-replicated * tables ony once. Only use this value if this function * returns true. */ bool IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, - List *rtable_list, JoinPath *join_path, bool *partitioned_replicated) + List *rtable_list, JoinPath *join_path, JoinReduceInfo *join_info) { XCWalkerContext context; ListCell *cell; bool maybe_reducible = false; bool result = false; - - *partitioned_replicated = false; + Assert(join_info); + join_info->partitioned_replicated = false; + join_info->exec_nodes = NULL; InitXCWalkerContext(&context); context.accessType = RELATION_ACCESS_READ; /* PGXCTODO - determine */ @@ -2819,7 +2923,6 @@ IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, context.rtables = lappend(context.rtables, rtable_list); /* add to list of lists */ - foreach(cell, join_path->joinrestrictinfo) { RestrictInfo *node = (RestrictInfo *) lfirst(cell); @@ -2874,7 +2977,7 @@ IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, if (pgxc_join->join_type == JOIN_REPLICATED_PARTITIONED) { - *partitioned_replicated = true; + join_info->partitioned_replicated = true; /* * If either of these already have such a join, we do not @@ -2891,6 +2994,18 @@ IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, } } + if (result) + { 
+ /* + * Set exec_nodes from walker if it was set. + * If not, it is replicated and we can use existing + */ + if (context.query_step) + join_info->exec_nodes = copyObject(context.query_step->exec_nodes); + else + join_info->exec_nodes = copyObject(outernode->exec_nodes); + } + return result; } diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 7284ef7..bb1f934 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -168,6 +168,12 @@ typedef struct StringInfoData valuebuf; } SimpleAgg; +typedef struct +{ + bool partitioned_replicated; + ExecNodes *exec_nodes; +} JoinReduceInfo; + /* forbid SQL if unsafe, useful to turn off for development */ extern bool StrictStatementChecking; @@ -181,6 +187,6 @@ extern bool IsHashDistributable(Oid col_type); extern bool is_immutable_func(Oid funcid); extern bool IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, - List *rtable_list, JoinPath *join_path, bool *partitioned_replicated); + List *rtable_list, JoinPath *join_path, JoinReduceInfo *join_info); #endif /* PGXCPLANNER_H */ ----------------------------------------------------------------------- Summary of changes: src/backend/commands/copy.c | 495 ++++++++++++++++++++++--------- src/backend/executor/execMain.c | 27 ++ src/backend/optimizer/plan/createplan.c | 29 ++- src/backend/pgxc/plan/planner.c | 203 ++++++++++--- src/include/pgxc/planner.h | 8 +- 5 files changed, 562 insertions(+), 200 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-12-09 16:44:00
|
Project "Postgres-XC". The branch, master has been updated via 88a8a1f4ada3b76dd075b446c392789901ab4958 (commit) from 6d5e89eb5a30fd6f3e5256e40e3376ee8063f93e (commit) - Log ----------------------------------------------------------------- commit 88a8a1f4ada3b76dd075b446c392789901ab4958 Author: Mason Sharp <ma...@us...> Date: Thu Dec 9 11:42:58 2010 -0500 This fixes a couple of issues noticed after the last commit, which added checking for expressions in the SELECT clause. Fxied a bug with expression checking because of context recursion depth. Also, we now allow built-in volatile expressions in statements to be pushed down. Without this, having NOW() in DML was causing a problem in DBT-1. We should go back and examine this more carefully to make sure the built-in ones are all safe. Also, we still make the false assumption that immutable functions will not write to the database. This could be addressed by either a more thorough analysis of functions at creation time or additional tags at creation time (preferably in standard PostgreSQL). Ideally such functionality could be committed back to standard PostgreSQL. 
diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 83ee829..02b863a 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -150,6 +150,8 @@ static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context) static int handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stmt); static void InitXCWalkerContext(XCWalkerContext *context); static void validate_part_col_updatable(const Query *query); +static bool is_pgxc_safe_func(Oid funcid); + /* * Find position of specified substring in the string @@ -850,7 +852,6 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) return false; } } - else if (IsA(expr_node, BoolExpr)) { BoolExpr *boolexpr = (BoolExpr *) expr_node; @@ -1082,7 +1083,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) /* See if the function is immutable, otherwise give up */ if (IsA(expr_node, FuncExpr)) { - if (!is_immutable_func(((FuncExpr*) expr_node)->funcid)) + if (!is_pgxc_safe_func(((FuncExpr*) expr_node)->funcid)) return true; } @@ -1273,10 +1274,16 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) */ if (query->targetList) { + ExecNodes *save_nodes = context->query_step->exec_nodes; + int save_varno = context->varno; + foreach(item, query->targetList) { TargetEntry *target = (TargetEntry *) lfirst(item); + context->query_step->exec_nodes = NULL; + context->varno = 0; + if (examine_conditions_walker((Node*)target->expr, context)) return true; @@ -1290,11 +1297,11 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) return true; pfree(context->query_step->exec_nodes); - context->query_step->exec_nodes = NULL; } } + context->query_step->exec_nodes = save_nodes; + context->varno = save_varno; } - /* Look for JOIN syntax joins */ foreach(item, query->jointree->fromlist) { @@ -2554,7 +2561,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) { 
ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), - (errmsg("Complex and correlated UPDATE and DELETE not yet supported")))); + (errmsg("UPDATE and DELETE that are correlated or use non-immutable functions not yet supported")))); } /* @@ -2940,3 +2947,62 @@ validate_part_col_updatable(const Query *query) } } } + +/* + * See if it is safe to use this function in single step. + * + * Based on is_immutable_func from postgresql_fdw.c + * We add an exeption for base postgresql functions, to + * allow now() and others to still execute as part of single step + * queries. + * + * PGXCTODO - we currently make the false assumption that immutable + * functions will not write to the database. This could be addressed + * by either a more thorough analysis of functions at + * creation time or additional tags at creation time (preferably + * in standard PostgreSQL). Ideally such functionality could be + * committed back to standard PostgreSQL. + */ +bool +is_pgxc_safe_func(Oid funcid) +{ + HeapTuple tp; + bool isnull; + Datum datum; + bool ret_val = false; + + tp = SearchSysCache(PROCOID, ObjectIdGetDatum(funcid), 0, 0, 0); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + +#ifdef DEBUG_FDW + /* print function name and its immutability */ + { + char *proname; + datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_proname, &isnull); + proname = pstrdup(DatumGetName(datum)->data); + elog(DEBUG1, "func %s(%u) is%s immutable", proname, funcid, + (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE) ? "" : " not"); + pfree(proname); + } +#endif + + datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_provolatile, &isnull); + + if (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE) + ret_val = true; + /* + * Also allow stable and volatile ones that are in the PG_CATALOG_NAMESPACE + * this allows now() and others that do not update the database + * PGXCTODO - examine default functions carefully for those that may + * write to the database. 
+ */ + else + { + datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_pronamespace, &isnull); + if (DatumGetObjectId(datum) == PG_CATALOG_NAMESPACE) + ret_val = true; + } + ReleaseSysCache(tp); + return ret_val; +} ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 76 ++++++++++++++++++++++++++++++++++++--- 1 files changed, 71 insertions(+), 5 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2010-12-09 01:12:56
|
Project "Postgres-XC". The branch, PGXC-sqlmed has been deleted was eb50a76cb929fbe4a31d093b43e1589382c892a0 ----------------------------------------------------------------------- eb50a76cb929fbe4a31d093b43e1589382c892a0 Set remote relation stats (pages, rows etc) to a lower value so that NestLoop joins are preferred over other join types. This is necessary until we can handle other join types for remote join reduction ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Pavan D. <pa...@us...> - 2010-12-02 06:57:20
|
Project "Postgres-XC". The branch, master has been updated via 6d5e89eb5a30fd6f3e5256e40e3376ee8063f93e (commit) from c66ed018bf6e7295c576286bba275af109b4bcb9 (commit) - Log ----------------------------------------------------------------- commit 6d5e89eb5a30fd6f3e5256e40e3376ee8063f93e Author: Pavan Deolasee <pav...@gm...> Date: Thu Dec 2 12:21:42 2010 +0530 Check for buffer overflow while constructing gtm/gtm_proxy start/stop commands. In passing, also fix another bug where an uninitialized var was being used. Bug report and patch by Xiong Wang (Benny) with some tweaks by me diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 3b01796..46d9364 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -246,26 +246,52 @@ static int start_gtm(void) { char cmd[MAXPGPATH]; + char gtm_app_path[MAXPGPATH]; + int len; + /* * Since there might be quotes to handle here, it is easier simply to pass * everything to a shell to process them. */ + memset(gtm_app_path, 0, MAXPGPATH); + memset(cmd, 0, MAXPGPATH); + + /* + * Construct gtm binary path. 
We should leave one byte at the end for '\0' + */ + len = 0; if (gtm_path != NULL) { - strcat(gtm_path, "/"); - strcat(gtm_path, gtm_app); + strncpy(gtm_app_path, gtm_path, MAXPGPATH - len - 1); + + len = strlen(gtm_app_path); + strncat(gtm_app_path, "/", MAXPGPATH - len - 1); + + len = strlen(gtm_app_path); } - else - gtm_path = gtm_app; + + if (strlen(gtm_app) >= (MAXPGPATH - len - 1)) + { + write_stderr("gtm command exceeds max size"); + exit(1); + } + + strncat(gtm_app_path, gtm_app, MAXPGPATH - len - 1); if (log_file != NULL) - snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE, - gtm_path, gtmdata_opt, gtm_opts, + len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE, + gtm_app_path, gtmdata_opt, gtm_opts, DEVNULL, log_file); else - snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE, - gtm_path, gtmdata_opt, gtm_opts, DEVNULL); + len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE, + gtm_app_path, gtmdata_opt, gtm_opts, DEVNULL); + + if (len >= MAXPGPATH - 1) + { + write_stderr("gtm command exceeds max size"); + exit(1); + } return system(cmd); } @@ -376,14 +402,13 @@ read_gtm_opts(void) { int len; char *optline; - char *arg1; optline = optlines[0]; /* trim off line endings */ len = strcspn(optline, "\r\n"); optline[len] = '\0'; - gtm_opts = arg1; + gtm_opts = optline; } } } diff --git a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c index 339f50a..41ef105 100644 --- a/src/gtm/libpq/pqformat.c +++ b/src/gtm/libpq/pqformat.c @@ -134,20 +134,9 @@ pq_sendcountedtext(StringInfo buf, const char *str, int slen, bool countincludesself) { int extra = countincludesself ? 4 : 0; - char *p; - if (p != str) /* actual conversion has been done? 
*/ - { - slen = strlen(p); - pq_sendint(buf, slen + extra, 4); - appendBinaryStringInfo(buf, p, slen); - pfree(p); - } - else - { - pq_sendint(buf, slen + extra, 4); - appendBinaryStringInfo(buf, str, slen); - } + pq_sendint(buf, slen + extra, 4); + appendBinaryStringInfo(buf, str, slen); } /* -------------------------------- @@ -163,16 +152,7 @@ pq_sendcountedtext(StringInfo buf, const char *str, int slen, void pq_sendtext(StringInfo buf, const char *str, int slen) { - char *p; - - if (p != str) /* actual conversion has been done? */ - { - slen = strlen(p); - appendBinaryStringInfo(buf, p, slen); - pfree(p); - } - else - appendBinaryStringInfo(buf, str, slen); + appendBinaryStringInfo(buf, str, slen); } /* -------------------------------- ----------------------------------------------------------------------- Summary of changes: src/gtm/gtm_ctl/gtm_ctl.c | 45 +++++++++++++++++++++++++++++++++++---------- src/gtm/libpq/pqformat.c | 26 +++----------------------- 2 files changed, 38 insertions(+), 33 deletions(-) hooks/post-receive -- Postgres-XC |