summaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/network.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/network.c')
-rw-r--r--src/backend/utils/adt/network.c399
1 files changed, 399 insertions, 0 deletions
diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c
index 5e980cf23f0..6b367644f90 100644
--- a/src/backend/utils/adt/network.c
+++ b/src/backend/utils/adt/network.c
@@ -16,6 +16,7 @@
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "common/ip.h"
+#include "lib/hyperloglog.h"
#include "libpq/libpq-be.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
@@ -24,12 +25,37 @@
#include "nodes/supportnodes.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
+#include "utils/guc.h"
#include "utils/hashutils.h"
#include "utils/inet.h"
#include "utils/lsyscache.h"
+#include "utils/sortsupport.h"
+/*
+ * An IPv4 netmask size is a value in the range of 0 - 32, which is
+ * represented with 6 bits in inet/cidr abbreviated keys where possible.
+ *
+ * An IPv4 inet/cidr abbreviated key can use up to 25 bits for subnet
+ * component.
+ */
+#define ABBREV_BITS_INET4_NETMASK_SIZE 6
+#define ABBREV_BITS_INET4_SUBNET 25
+
+/* sortsupport for inet/cidr */
+typedef struct
+{
+ int64 input_count; /* number of non-null values seen */
+ bool estimating; /* true if estimating cardinality */
+
+ hyperLogLogState abbr_card; /* cardinality estimator */
+} network_sortsupport_state;
+
static int32 network_cmp_internal(inet *a1, inet *a2);
+static int network_fast_cmp(Datum x, Datum y, SortSupport ssup);
+static int network_cmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static bool network_abbrev_abort(int memtupcount, SortSupport ssup);
+static Datum network_abbrev_convert(Datum original, SortSupport ssup);
static List *match_network_function(Node *leftop,
Node *rightop,
int indexarg,
@@ -406,6 +432,379 @@ network_cmp(PG_FUNCTION_ARGS)
}
/*
+ * SortSupport strategy routine
+ */
+Datum
+network_sortsupport(PG_FUNCTION_ARGS)
+{
+ SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+ ssup->comparator = network_fast_cmp;
+ ssup->ssup_extra = NULL;
+
+ if (ssup->abbreviate)
+ {
+ network_sortsupport_state *uss;
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
+
+ uss = palloc(sizeof(network_sortsupport_state));
+ uss->input_count = 0;
+ uss->estimating = true;
+ initHyperLogLog(&uss->abbr_card, 10);
+
+ ssup->ssup_extra = uss;
+
+ ssup->comparator = network_cmp_abbrev;
+ ssup->abbrev_converter = network_abbrev_convert;
+ ssup->abbrev_abort = network_abbrev_abort;
+ ssup->abbrev_full_comparator = network_fast_cmp;
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SortSupport comparison func
+ */
+static int
+network_fast_cmp(Datum x, Datum y, SortSupport ssup)
+{
+ inet *arg1 = DatumGetInetPP(x);
+ inet *arg2 = DatumGetInetPP(y);
+
+ return network_cmp_internal(arg1, arg2);
+}
+
+/*
+ * Abbreviated key comparison func
+ */
+static int
+network_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+ if (x > y)
+ return 1;
+ else if (x == y)
+ return 0;
+ else
+ return -1;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization.
+ *
+ * We pay no attention to the cardinality of the non-abbreviated data, because
+ * there is no equality fast-path within authoritative inet comparator.
+ */
+static bool
+network_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+ network_sortsupport_state *uss = ssup->ssup_extra;
+ double abbr_card;
+
+ if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating)
+ return false;
+
+ abbr_card = estimateHyperLogLog(&uss->abbr_card);
+
+ /*
+ * If we have >100k distinct values, then even if we were sorting many
+ * billion rows we'd likely still break even, and the penalty of undoing
+ * that many rows of abbrevs would probably not be worth it. At this point
+ * we stop counting because we know that we're now fully committed.
+ */
+ if (abbr_card > 100000.0)
+ {
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "network_abbrev: estimation ends at cardinality %f"
+ " after " INT64_FORMAT " values (%d rows)",
+ abbr_card, uss->input_count, memtupcount);
+#endif
+ uss->estimating = false;
+ return false;
+ }
+
+ /*
+ * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row
+ * fudge factor allows us to abort earlier on genuinely pathological data
+ * where we've had exactly one abbreviated value in the first 2k
+ * (non-null) rows.
+ */
+ if (abbr_card < uss->input_count / 2000.0 + 0.5)
+ {
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "network_abbrev: aborting abbreviation at cardinality %f"
+ " below threshold %f after " INT64_FORMAT " values (%d rows)",
+ abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count,
+ memtupcount);
+#endif
+ return true;
+ }
+
+#ifdef TRACE_SORT
+ if (trace_sort)
+ elog(LOG,
+ "network_abbrev: cardinality %f after " INT64_FORMAT
+ " values (%d rows)", abbr_card, uss->input_count, memtupcount);
+#endif
+
+ return false;
+}
+
+/*
+ * SortSupport conversion routine. Converts original inet/cidr representation
+ * to abbreviated key representation that works with simple 3-way unsigned int
+ * comparisons. The network_cmp_internal() rules for sorting inet/cidr datums
+ * are followed by abbreviated comparisons by an encoding scheme that
+ * conditions keys through careful use of padding.
+ *
+ * Some background: inet values have three major components (take for example
+ * the address 1.2.3.4/24):
+ *
+ * * A network, or netmasked bits (1.2.3.0).
+ * * A netmask size (/24).
+ * * A subnet, or bits outside of the netmask (0.0.0.4).
+ *
+ * cidr values are the same except that with only the first two components --
+ * all their subnet bits *must* be zero (1.2.3.0/24).
+ *
+ * IPv4 and IPv6 are identical in this makeup, with the difference being that
+ * IPv4 addresses have a maximum of 32 bits compared to IPv6's 64 bits, so in
+ * IPv6 each part may be larger.
+ *
+ * inet/cdir types compare using these sorting rules. If inequality is detected
+ * at any step, comparison is finished. If any rule is a tie, the algorithm
+ * drops through to the next to break it:
+ *
+ * 1. IPv4 always appears before IPv6.
+ * 2. Network bits are compared.
+ * 3. Netmask size is compared.
+ * 4. All bits are compared (having made it here, we know that both
+ * netmasked bits and netmask size are equal, so we're in effect only
+ * comparing subnet bits).
+ *
+ * When generating abbreviated keys for SortSupport, we pack as much as we can
+ * into a datum while ensuring that when comparing those keys as integers,
+ * these rules will be respected. Exact contents depend on IP family and datum
+ * size.
+ *
+ * IPv4
+ * ----
+ *
+ * 4 byte datums:
+ *
+ * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in
+ * every case below) followed by all but 1 of the netmasked bits.
+ *
+ * +----------+---------------------+
+ * | 1 bit IP | 31 bits network | (1 bit network
+ * | family | (truncated) | omitted)
+ * +----------+---------------------+
+ *
+ * 8 byte datums:
+ *
+ * We have space to store all netmasked bits, followed by the netmask size,
+ * followed by 25 bits of the subnet (25 bits is usually more than enough in
+ * practice). cidr datums always have all-zero subnet bits.
+ *
+ * +----------+-----------------------+--------------+--------------------+
+ * | 1 bit IP | 32 bits network | 6 bits | 25 bits subnet |
+ * | family | (full) | network size | (truncated) |
+ * +----------+-----------------------+--------------+--------------------+
+ *
+ * IPv6
+ * ----
+ *
+ * 4 byte datums:
+ *
+ * +----------+---------------------+
+ * | 1 bit IP | 31 bits network | (up to 97 bits
+ * | family | (truncated) | network omitted)
+ * +----------+---------------------+
+ *
+ * 8 byte datums:
+ *
+ * +----------+---------------------------------+
+ * | 1 bit IP | 63 bits network | (up to 65 bits
+ * | family | (truncated) | network omitted)
+ * +----------+---------------------------------+
+ */
+static Datum
+network_abbrev_convert(Datum original, SortSupport ssup)
+{
+ network_sortsupport_state *uss = ssup->ssup_extra;
+ inet *authoritative = DatumGetInetPP(original);
+ Datum res,
+ ipaddr_datum,
+ subnet_bitmask,
+ network;
+ int subnet_size;
+
+ Assert(ip_family(authoritative) == PGSQL_AF_INET ||
+ ip_family(authoritative) == PGSQL_AF_INET6);
+
+ /*
+ * Get an unsigned integer representation of the IP address by taking its
+ * first 4 or 8 bytes. Always take all 4 bytes of an IPv4 address. Take
+ * the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes
+ * otherwise.
+ *
+ * We're consuming an array of unsigned char, so byteswap on little endian
+ * systems (an inet's ipaddr field stores the most significant byte
+ * first).
+ */
+ if (ip_family(authoritative) == PGSQL_AF_INET)
+ {
+ uint32 ipaddr_datum32;
+
+ memcpy(&ipaddr_datum32, ip_addr(authoritative), sizeof(uint32));
+
+ /* Must byteswap on little-endian machines */
+#ifndef WORDS_BIGENDIAN
+ ipaddr_datum = pg_bswap32(ipaddr_datum32);
+#else
+ ipaddr_datum = ipaddr_datum32;
+#endif
+
+ /* Initialize result without setting ipfamily bit */
+ res = (Datum) 0;
+ }
+ else
+ {
+ memcpy(&ipaddr_datum, ip_addr(authoritative), sizeof(Datum));
+
+ /* Must byteswap on little-endian machines */
+ ipaddr_datum = DatumBigEndianToNative(ipaddr_datum);
+
+ /* Initialize result with ipfamily (most significant) bit set */
+ res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1);
+ }
+
+ /*
+ * ipaddr_datum must be "split": high order bits go in "network" component
+ * of abbreviated key (often with zeroed bits at the end due to masking),
+ * while low order bits go in "subnet" component when there is space for
+ * one. This is often accomplished by generating a temp datum subnet
+ * bitmask, which we may reuse later when generating the subnet bits
+ * themselves. (Note that subnet bits are only used with IPv4 datums on
+ * platforms where datum is 8 bytes.)
+ *
+ * The number of bits in subnet is used to generate a datum subnet
+ * bitmask. For example, with a /24 IPv4 datum there are 8 subnet bits
+ * (since 32 - 24 is 8), so the final subnet bitmask is B'1111 1111'. We
+ * need explicit handling for cases where the ipaddr bits cannot all fit
+ * in a datum, though (otherwise we'd incorrectly mask the network
+ * component with IPv6 values).
+ */
+ subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative);
+ Assert(subnet_size >= 0);
+ /* subnet size must work with prefix ipaddr cases */
+ subnet_size %= SIZEOF_DATUM * BITS_PER_BYTE;
+ if (ip_bits(authoritative) == 0)
+ {
+ /* Fit as many ipaddr bits as possible into subnet */
+ subnet_bitmask = ((Datum) 0) - 1;
+ network = 0;
+ }
+ else if (ip_bits(authoritative) < SIZEOF_DATUM * BITS_PER_BYTE)
+ {
+ /* Split ipaddr bits between network and subnet */
+ subnet_bitmask = (((Datum) 1) << subnet_size) - 1;
+ network = ipaddr_datum & ~subnet_bitmask;
+ }
+ else
+ {
+ /* Fit as many ipaddr bits as possible into network */
+ subnet_bitmask = 0;
+ network = ipaddr_datum;
+ }
+
+#if SIZEOF_DATUM == 8
+ if (ip_family(authoritative) == PGSQL_AF_INET)
+ {
+ /*
+ * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size,
+ * and most significant 25 subnet bits
+ */
+ Datum netmask_size = (Datum) ip_bits(authoritative);
+ Datum subnet;
+
+ /*
+ * Shift left 31 bits: 6 bits netmask size + 25 subnet bits.
+ *
+ * We don't make any distinction between network bits that are zero
+ * due to masking and "true"/non-masked zero bits. An abbreviated
+ * comparison that is resolved by comparing a non-masked and non-zero
+ * bit to a masked/zeroed bit is effectively resolved based on
+ * ip_bits(), even though the comparison won't reach the netmask_size
+ * bits.
+ */
+ network <<= (ABBREV_BITS_INET4_NETMASK_SIZE +
+ ABBREV_BITS_INET4_SUBNET);
+
+ /* Shift size to make room for subnet bits at the end */
+ netmask_size <<= ABBREV_BITS_INET4_SUBNET;
+
+ /* Extract subnet bits without shifting them */
+ subnet = ipaddr_datum & subnet_bitmask;
+
+ /*
+ * If we have more than 25 subnet bits, we can't fit everything. Shift
+ * subnet down to avoid clobbering bits that are only supposed to be
+ * used for netmask_size.
+ *
+ * Discarding the least significant subnet bits like this is correct
+ * because abbreviated comparisons that are resolved at the subnet
+ * level must have had equal netmask_size/ip_bits() values in order to
+ * get that far.
+ */
+ if (subnet_size > ABBREV_BITS_INET4_SUBNET)
+ subnet >>= subnet_size - ABBREV_BITS_INET4_SUBNET;
+
+ /*
+ * Assemble the final abbreviated key without clobbering the ipfamily
+ * bit that must remain a zero.
+ */
+ res |= network | netmask_size | subnet;
+ }
+ else
+#endif
+ {
+ /*
+ * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the
+ * netmasked bits as will fit in final abbreviated key. Avoid
+ * clobbering the ipfamily bit that was set earlier.
+ */
+ res |= network >> 1;
+ }
+
+ uss->input_count += 1;
+
+ /* Hash abbreviated key */
+ if (uss->estimating)
+ {
+ uint32 tmp;
+
+#if SIZEOF_DATUM == 8
+ tmp = (uint32) res ^ (uint32) ((uint64) res >> 32);
+#else /* SIZEOF_DATUM != 8 */
+ tmp = (uint32) res;
+#endif
+
+ addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+ }
+
+ return res;
+}
+
+/*
* Boolean ordering tests.
*/
Datum