diff options
Diffstat (limited to 'src/backend/utils/adt/network.c')
-rw-r--r-- | src/backend/utils/adt/network.c | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 5e980cf23f0..6b367644f90 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -16,6 +16,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/ip.h" +#include "lib/hyperloglog.h" #include "libpq/libpq-be.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -24,12 +25,37 @@ #include "nodes/supportnodes.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/guc.h" #include "utils/hashutils.h" #include "utils/inet.h" #include "utils/lsyscache.h" +#include "utils/sortsupport.h" +/* + * An IPv4 netmask size is a value in the range of 0 - 32, which is + * represented with 6 bits in inet/cidr abbreviated keys where possible. + * + * An IPv4 inet/cidr abbreviated key can use up to 25 bits for subnet + * component. + */ +#define ABBREV_BITS_INET4_NETMASK_SIZE 6 +#define ABBREV_BITS_INET4_SUBNET 25 + +/* sortsupport for inet/cidr */ +typedef struct +{ + int64 input_count; /* number of non-null values seen */ + bool estimating; /* true if estimating cardinality */ + + hyperLogLogState abbr_card; /* cardinality estimator */ +} network_sortsupport_state; + static int32 network_cmp_internal(inet *a1, inet *a2); +static int network_fast_cmp(Datum x, Datum y, SortSupport ssup); +static int network_cmp_abbrev(Datum x, Datum y, SortSupport ssup); +static bool network_abbrev_abort(int memtupcount, SortSupport ssup); +static Datum network_abbrev_convert(Datum original, SortSupport ssup); static List *match_network_function(Node *leftop, Node *rightop, int indexarg, @@ -406,6 +432,379 @@ network_cmp(PG_FUNCTION_ARGS) } /* + * SortSupport strategy routine + */ +Datum +network_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + + ssup->comparator = network_fast_cmp; + ssup->ssup_extra = NULL; + + if (ssup->abbreviate) + { + network_sortsupport_state *uss; + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + uss = palloc(sizeof(network_sortsupport_state)); + uss->input_count = 0; + uss->estimating = true; + initHyperLogLog(&uss->abbr_card, 10); + + ssup->ssup_extra = uss; + + ssup->comparator = network_cmp_abbrev; + ssup->abbrev_converter = network_abbrev_convert; + ssup->abbrev_abort = network_abbrev_abort; + ssup->abbrev_full_comparator = network_fast_cmp; + + MemoryContextSwitchTo(oldcontext); + } + + PG_RETURN_VOID(); +} + +/* + * SortSupport comparison func + */ +static int +network_fast_cmp(Datum x, Datum y, SortSupport ssup) +{ + inet *arg1 = DatumGetInetPP(x); + inet *arg2 = DatumGetInetPP(y); + + return network_cmp_internal(arg1, arg2); +} + +/* + * Abbreviated key comparison func + */ +static int +network_cmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + if (x > y) + return 1; + else if (x == y) + return 0; + else + return -1; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization. + * + * We pay no attention to the cardinality of the non-abbreviated data, because + * there is no equality fast-path within authoritative inet comparator. + */ +static bool +network_abbrev_abort(int memtupcount, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + double abbr_card; + + if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating) + return false; + + abbr_card = estimateHyperLogLog(&uss->abbr_card); + + /* + * If we have >100k distinct values, then even if we were sorting many + * billion rows we'd likely still break even, and the penalty of undoing + * that many rows of abbrevs would probably not be worth it. At this point + * we stop counting because we know that we're now fully committed. + */ + if (abbr_card > 100000.0) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: estimation ends at cardinality %f" + " after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count, memtupcount); +#endif + uss->estimating = false; + return false; + } + + /* + * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row + * fudge factor allows us to abort earlier on genuinely pathological data + * where we've had exactly one abbreviated value in the first 2k + * (non-null) rows. + */ + if (abbr_card < uss->input_count / 2000.0 + 0.5) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: aborting abbreviation at cardinality %f" + " below threshold %f after " INT64_FORMAT " values (%d rows)", + abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count, + memtupcount); +#endif + return true; + } + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "network_abbrev: cardinality %f after " INT64_FORMAT + " values (%d rows)", abbr_card, uss->input_count, memtupcount); +#endif + + return false; +} + +/* + * SortSupport conversion routine. Converts original inet/cidr representation + * to abbreviated key representation that works with simple 3-way unsigned int + * comparisons. The network_cmp_internal() rules for sorting inet/cidr datums + * are followed by abbreviated comparisons by an encoding scheme that + * conditions keys through careful use of padding. + * + * Some background: inet values have three major components (take for example + * the address 1.2.3.4/24): + * + * * A network, or netmasked bits (1.2.3.0). + * * A netmask size (/24). + * * A subnet, or bits outside of the netmask (0.0.0.4). + * + * cidr values are the same except that with only the first two components -- + * all their subnet bits *must* be zero (1.2.3.0/24). + * + * IPv4 and IPv6 are identical in this makeup, with the difference being that + * IPv4 addresses have a maximum of 32 bits compared to IPv6's 64 bits, so in + * IPv6 each part may be larger. + * + * inet/cdir types compare using these sorting rules. If inequality is detected + * at any step, comparison is finished. If any rule is a tie, the algorithm + * drops through to the next to break it: + * + * 1. IPv4 always appears before IPv6. + * 2. Network bits are compared. + * 3. Netmask size is compared. + * 4. All bits are compared (having made it here, we know that both + * netmasked bits and netmask size are equal, so we're in effect only + * comparing subnet bits). + * + * When generating abbreviated keys for SortSupport, we pack as much as we can + * into a datum while ensuring that when comparing those keys as integers, + * these rules will be respected. Exact contents depend on IP family and datum + * size. + * + * IPv4 + * ---- + * + * 4 byte datums: + * + * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in + * every case below) followed by all but 1 of the netmasked bits. + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (1 bit network + * | family | (truncated) | omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * We have space to store all netmasked bits, followed by the netmask size, + * followed by 25 bits of the subnet (25 bits is usually more than enough in + * practice). cidr datums always have all-zero subnet bits. + * + * +----------+-----------------------+--------------+--------------------+ + * | 1 bit IP | 32 bits network | 6 bits | 25 bits subnet | + * | family | (full) | network size | (truncated) | + * +----------+-----------------------+--------------+--------------------+ + * + * IPv6 + * ---- + * + * 4 byte datums: + * + * +----------+---------------------+ + * | 1 bit IP | 31 bits network | (up to 97 bits + * | family | (truncated) | network omitted) + * +----------+---------------------+ + * + * 8 byte datums: + * + * +----------+---------------------------------+ + * | 1 bit IP | 63 bits network | (up to 65 bits + * | family | (truncated) | network omitted) + * +----------+---------------------------------+ + */ +static Datum +network_abbrev_convert(Datum original, SortSupport ssup) +{ + network_sortsupport_state *uss = ssup->ssup_extra; + inet *authoritative = DatumGetInetPP(original); + Datum res, + ipaddr_datum, + subnet_bitmask, + network; + int subnet_size; + + Assert(ip_family(authoritative) == PGSQL_AF_INET || + ip_family(authoritative) == PGSQL_AF_INET6); + + /* + * Get an unsigned integer representation of the IP address by taking its + * first 4 or 8 bytes. Always take all 4 bytes of an IPv4 address. Take + * the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes + * otherwise. + * + * We're consuming an array of unsigned char, so byteswap on little endian + * systems (an inet's ipaddr field stores the most significant byte + * first). + */ + if (ip_family(authoritative) == PGSQL_AF_INET) + { + uint32 ipaddr_datum32; + + memcpy(&ipaddr_datum32, ip_addr(authoritative), sizeof(uint32)); + + /* Must byteswap on little-endian machines */ +#ifndef WORDS_BIGENDIAN + ipaddr_datum = pg_bswap32(ipaddr_datum32); +#else + ipaddr_datum = ipaddr_datum32; +#endif + + /* Initialize result without setting ipfamily bit */ + res = (Datum) 0; + } + else + { + memcpy(&ipaddr_datum, ip_addr(authoritative), sizeof(Datum)); + + /* Must byteswap on little-endian machines */ + ipaddr_datum = DatumBigEndianToNative(ipaddr_datum); + + /* Initialize result with ipfamily (most significant) bit set */ + res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1); + } + + /* + * ipaddr_datum must be "split": high order bits go in "network" component + * of abbreviated key (often with zeroed bits at the end due to masking), + * while low order bits go in "subnet" component when there is space for + * one. This is often accomplished by generating a temp datum subnet + * bitmask, which we may reuse later when generating the subnet bits + * themselves. (Note that subnet bits are only used with IPv4 datums on + * platforms where datum is 8 bytes.) + * + * The number of bits in subnet is used to generate a datum subnet + * bitmask. For example, with a /24 IPv4 datum there are 8 subnet bits + * (since 32 - 24 is 8), so the final subnet bitmask is B'1111 1111'. We + * need explicit handling for cases where the ipaddr bits cannot all fit + * in a datum, though (otherwise we'd incorrectly mask the network + * component with IPv6 values). + */ + subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative); + Assert(subnet_size >= 0); + /* subnet size must work with prefix ipaddr cases */ + subnet_size %= SIZEOF_DATUM * BITS_PER_BYTE; + if (ip_bits(authoritative) == 0) + { + /* Fit as many ipaddr bits as possible into subnet */ + subnet_bitmask = ((Datum) 0) - 1; + network = 0; + } + else if (ip_bits(authoritative) < SIZEOF_DATUM * BITS_PER_BYTE) + { + /* Split ipaddr bits between network and subnet */ + subnet_bitmask = (((Datum) 1) << subnet_size) - 1; + network = ipaddr_datum & ~subnet_bitmask; + } + else + { + /* Fit as many ipaddr bits as possible into network */ + subnet_bitmask = 0; + network = ipaddr_datum; + } + +#if SIZEOF_DATUM == 8 + if (ip_family(authoritative) == PGSQL_AF_INET) + { + /* + * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size, + * and most significant 25 subnet bits + */ + Datum netmask_size = (Datum) ip_bits(authoritative); + Datum subnet; + + /* + * Shift left 31 bits: 6 bits netmask size + 25 subnet bits. + * + * We don't make any distinction between network bits that are zero + * due to masking and "true"/non-masked zero bits. An abbreviated + * comparison that is resolved by comparing a non-masked and non-zero + * bit to a masked/zeroed bit is effectively resolved based on + * ip_bits(), even though the comparison won't reach the netmask_size + * bits. + */ + network <<= (ABBREV_BITS_INET4_NETMASK_SIZE + + ABBREV_BITS_INET4_SUBNET); + + /* Shift size to make room for subnet bits at the end */ + netmask_size <<= ABBREV_BITS_INET4_SUBNET; + + /* Extract subnet bits without shifting them */ + subnet = ipaddr_datum & subnet_bitmask; + + /* + * If we have more than 25 subnet bits, we can't fit everything. Shift + * subnet down to avoid clobbering bits that are only supposed to be + * used for netmask_size. + * + * Discarding the least significant subnet bits like this is correct + * because abbreviated comparisons that are resolved at the subnet + * level must have had equal netmask_size/ip_bits() values in order to + * get that far. + */ + if (subnet_size > ABBREV_BITS_INET4_SUBNET) + subnet >>= subnet_size - ABBREV_BITS_INET4_SUBNET; + + /* + * Assemble the final abbreviated key without clobbering the ipfamily + * bit that must remain a zero. + */ + res |= network | netmask_size | subnet; + } + else +#endif + { + /* + * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the + * netmasked bits as will fit in final abbreviated key. Avoid + * clobbering the ipfamily bit that was set earlier. + */ + res |= network >> 1; + } + + uss->input_count += 1; + + /* Hash abbreviated key */ + if (uss->estimating) + { + uint32 tmp; + +#if SIZEOF_DATUM == 8 + tmp = (uint32) res ^ (uint32) ((uint64) res >> 32); +#else /* SIZEOF_DATUM != 8 */ + tmp = (uint32) res; +#endif + + addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp))); + } + + return res; +} + +/* * Boolean ordering tests. */ Datum |