1 files changed, 399 insertions, 0 deletions
diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c
index 5e980cf23f0..6b367644f90 100644
--- a/src/backend/utils/adt/network.c
+++ b/src/backend/utils/adt/network.c
@@ -16,6 +16,7 @@
 #include "catalog/pg_opfamily.h"
 #include "catalog/pg_type.h"
 #include "common/ip.h"
+#include "lib/hyperloglog.h"
 #include "libpq/libpq-be.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
@@ -24,12 +25,37 @@
 #include "nodes/supportnodes.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
+#include "utils/guc.h"
 #include "utils/hashutils.h"
 #include "utils/inet.h"
 #include "utils/lsyscache.h"
+#include "utils/sortsupport.h"
 
 
+/*
+ * An IPv4 netmask size is a value in the range of 0 - 32, which is
+ * represented with 6 bits in inet/cidr abbreviated keys where possible.
+ *
+ * An IPv4 inet/cidr abbreviated key can use up to 25 bits for subnet
+ * component.
+ */
+#define ABBREV_BITS_INET4_NETMASK_SIZE	6
+#define ABBREV_BITS_INET4_SUBNET		25
+
+/* sortsupport for inet/cidr */
+typedef struct
+{
+	int64		input_count;	/* number of non-null values seen */
+	bool		estimating;		/* true if estimating cardinality */
+
+	hyperLogLogState abbr_card; /* cardinality estimator */
+} network_sortsupport_state;
+
 static int32 network_cmp_internal(inet *a1, inet *a2);
+static int	network_fast_cmp(Datum x, Datum y, SortSupport ssup);
+static int	network_cmp_abbrev(Datum x, Datum y, SortSupport ssup);
+static bool network_abbrev_abort(int memtupcount, SortSupport ssup);
+static Datum network_abbrev_convert(Datum original, SortSupport ssup);
 static List *match_network_function(Node *leftop,
 									Node *rightop,
 									int indexarg,
@@ -406,6 +432,379 @@ network_cmp(PG_FUNCTION_ARGS)
 }
 
 /*
+ * SortSupport strategy routine
+ */
+Datum
+network_sortsupport(PG_FUNCTION_ARGS)
+{
+	SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
+
+	ssup->comparator = network_fast_cmp;
+	ssup->ssup_extra = NULL;
+
+	if (ssup->abbreviate)
+	{
+		network_sortsupport_state *uss;
+		MemoryContext oldcontext;
+
+		oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
+
+		uss = palloc(sizeof(network_sortsupport_state));
+		uss->input_count = 0;
+		uss->estimating = true;
+		initHyperLogLog(&uss->abbr_card, 10);
+
+		ssup->ssup_extra = uss;
+
+		ssup->comparator = network_cmp_abbrev;
+		ssup->abbrev_converter = network_abbrev_convert;
+		ssup->abbrev_abort = network_abbrev_abort;
+		ssup->abbrev_full_comparator = network_fast_cmp;
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SortSupport comparison func
+ */
+static int
+network_fast_cmp(Datum x, Datum y, SortSupport ssup)
+{
+	inet	   *arg1 = DatumGetInetPP(x);
+	inet	   *arg2 = DatumGetInetPP(y);
+
+	return network_cmp_internal(arg1, arg2);
+}
+
+/*
+ * Abbreviated key comparison func
+ */
+static int
+network_cmp_abbrev(Datum x, Datum y, SortSupport ssup)
+{
+	if (x > y)
+		return 1;
+	else if (x == y)
+		return 0;
+	else
+		return -1;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization.
+ *
+ * We pay no attention to the cardinality of the non-abbreviated data, because
+ * there is no equality fast-path within authoritative inet comparator.
+ */
+static bool
+network_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+	network_sortsupport_state *uss = ssup->ssup_extra;
+	double		abbr_card;
+
+	if (memtupcount < 10000 || uss->input_count < 10000 || !uss->estimating)
+		return false;
+
+	abbr_card = estimateHyperLogLog(&uss->abbr_card);
+
+	/*
+	 * If we have >100k distinct values, then even if we were sorting many
+	 * billion rows we'd likely still break even, and the penalty of undoing
+	 * that many rows of abbrevs would probably not be worth it. At this point
+	 * we stop counting because we know that we're now fully committed.
+	 */
+	if (abbr_card > 100000.0)
+	{
+#ifdef TRACE_SORT
+		if (trace_sort)
+			elog(LOG,
+				 "network_abbrev: estimation ends at cardinality %f"
+				 " after " INT64_FORMAT " values (%d rows)",
+				 abbr_card, uss->input_count, memtupcount);
+#endif
+		uss->estimating = false;
+		return false;
+	}
+
+	/*
+	 * Target minimum cardinality is 1 per ~2k of non-null inputs. 0.5 row
+	 * fudge factor allows us to abort earlier on genuinely pathological data
+	 * where we've had exactly one abbreviated value in the first 2k
+	 * (non-null) rows.
+	 */
+	if (abbr_card < uss->input_count / 2000.0 + 0.5)
+	{
+#ifdef TRACE_SORT
+		if (trace_sort)
+			elog(LOG,
+				 "network_abbrev: aborting abbreviation at cardinality %f"
+				 " below threshold %f after " INT64_FORMAT " values (%d rows)",
+				 abbr_card, uss->input_count / 2000.0 + 0.5, uss->input_count,
+				 memtupcount);
+#endif
+		return true;
+	}
+
+#ifdef TRACE_SORT
+	if (trace_sort)
+		elog(LOG,
+			 "network_abbrev: cardinality %f after " INT64_FORMAT
+			 " values (%d rows)", abbr_card, uss->input_count, memtupcount);
+#endif
+
+	return false;
+}
+
+/*
+ * SortSupport conversion routine. Converts original inet/cidr representation
+ * to abbreviated key representation that works with simple 3-way unsigned int
+ * comparisons. The network_cmp_internal() rules for sorting inet/cidr datums
+ * are followed by abbreviated comparisons by an encoding scheme that
+ * conditions keys through careful use of padding.
+ *
+ * Some background: inet values have three major components (take for example
+ * the address 1.2.3.4/24):
+ *
+ *     * A network, or netmasked bits (1.2.3.0).
+ *     * A netmask size (/24).
+ *     * A subnet, or bits outside of the netmask (0.0.0.4).
+ *
+ * cidr values are the same except that with only the first two components --
+ * all their subnet bits *must* be zero (1.2.3.0/24).
+ *
+ * IPv4 and IPv6 are identical in this makeup, with the difference being that
+ * IPv4 addresses have a maximum of 32 bits compared to IPv6's 64 bits, so in
+ * IPv6 each part may be larger.
+ *
+ * inet/cdir types compare using these sorting rules. If inequality is detected
+ * at any step, comparison is finished. If any rule is a tie, the algorithm
+ * drops through to the next to break it:
+ *
+ *     1. IPv4 always appears before IPv6.
+ *     2. Network bits are compared.
+ *     3. Netmask size is compared.
+ *     4. All bits are compared (having made it here, we know that both
+ *        netmasked bits and netmask size are equal, so we're in effect only
+ *        comparing subnet bits).
+ *
+ * When generating abbreviated keys for SortSupport, we pack as much as we can
+ * into a datum while ensuring that when comparing those keys as integers,
+ * these rules will be respected. Exact contents depend on IP family and datum
+ * size.
+ *
+ * IPv4
+ * ----
+ *
+ * 4 byte datums:
+ *
+ * Start with 1 bit for the IP family (IPv4 or IPv6; this bit is present in
+ * every case below) followed by all but 1 of the netmasked bits.
+ *
+ * +----------+---------------------+
+ * | 1 bit IP |   31 bits network   |     (1 bit network
+ * |  family  |     (truncated)     |      omitted)
+ * +----------+---------------------+
+ *
+ * 8 byte datums:
+ *
+ * We have space to store all netmasked bits, followed by the netmask size,
+ * followed by 25 bits of the subnet (25 bits is usually more than enough in
+ * practice). cidr datums always have all-zero subnet bits.
+ *
+ * +----------+-----------------------+--------------+--------------------+
+ * | 1 bit IP |    32 bits network    |    6 bits    |   25 bits subnet   |
+ * |  family  |        (full)         | network size |    (truncated)     |
+ * +----------+-----------------------+--------------+--------------------+
+ *
+ * IPv6
+ * ----
+ *
+ * 4 byte datums:
+ *
+ * +----------+---------------------+
+ * | 1 bit IP |   31 bits network   |    (up to 97 bits
+ * |  family  |     (truncated)     |   network omitted)
+ * +----------+---------------------+
+ *
+ * 8 byte datums:
+ *
+ * +----------+---------------------------------+
+ * | 1 bit IP |         63 bits network         |    (up to 65 bits
+ * |  family  |           (truncated)           |   network omitted)
+ * +----------+---------------------------------+
+ */
+static Datum
+network_abbrev_convert(Datum original, SortSupport ssup)
+{
+	network_sortsupport_state *uss = ssup->ssup_extra;
+	inet	   *authoritative = DatumGetInetPP(original);
+	Datum		res,
+				ipaddr_datum,
+				subnet_bitmask,
+				network;
+	int			subnet_size;
+
+	Assert(ip_family(authoritative) == PGSQL_AF_INET ||
+		   ip_family(authoritative) == PGSQL_AF_INET6);
+
+	/*
+	 * Get an unsigned integer representation of the IP address by taking its
+	 * first 4 or 8 bytes. Always take all 4 bytes of an IPv4 address. Take
+	 * the first 8 bytes of an IPv6 address with an 8 byte datum and 4 bytes
+	 * otherwise.
+	 *
+	 * We're consuming an array of unsigned char, so byteswap on little endian
+	 * systems (an inet's ipaddr field stores the most significant byte
+	 * first).
+	 */
+	if (ip_family(authoritative) == PGSQL_AF_INET)
+	{
+		uint32		ipaddr_datum32;
+
+		memcpy(&ipaddr_datum32, ip_addr(authoritative), sizeof(uint32));
+
+		/* Must byteswap on little-endian machines */
+#ifndef WORDS_BIGENDIAN
+		ipaddr_datum = pg_bswap32(ipaddr_datum32);
+#else
+		ipaddr_datum = ipaddr_datum32;
+#endif
+
+		/* Initialize result without setting ipfamily bit */
+		res = (Datum) 0;
+	}
+	else
+	{
+		memcpy(&ipaddr_datum, ip_addr(authoritative), sizeof(Datum));
+
+		/* Must byteswap on little-endian machines */
+		ipaddr_datum = DatumBigEndianToNative(ipaddr_datum);
+
+		/* Initialize result with ipfamily (most significant) bit set */
+		res = ((Datum) 1) << (SIZEOF_DATUM * BITS_PER_BYTE - 1);
+	}
+
+	/*
+	 * ipaddr_datum must be "split": high order bits go in "network" component
+	 * of abbreviated key (often with zeroed bits at the end due to masking),
+	 * while low order bits go in "subnet" component when there is space for
+	 * one. This is often accomplished by generating a temp datum subnet
+	 * bitmask, which we may reuse later when generating the subnet bits
+	 * themselves.  (Note that subnet bits are only used with IPv4 datums on
+	 * platforms where datum is 8 bytes.)
+	 *
+	 * The number of bits in subnet is used to generate a datum subnet
+	 * bitmask. For example, with a /24 IPv4 datum there are 8 subnet bits
+	 * (since 32 - 24 is 8), so the final subnet bitmask is B'1111 1111'. We
+	 * need explicit handling for cases where the ipaddr bits cannot all fit
+	 * in a datum, though (otherwise we'd incorrectly mask the network
+	 * component with IPv6 values).
+	 */
+	subnet_size = ip_maxbits(authoritative) - ip_bits(authoritative);
+	Assert(subnet_size >= 0);
+	/* subnet size must work with prefix ipaddr cases */
+	subnet_size %= SIZEOF_DATUM * BITS_PER_BYTE;
+	if (ip_bits(authoritative) == 0)
+	{
+		/* Fit as many ipaddr bits as possible into subnet */
+		subnet_bitmask = ((Datum) 0) - 1;
+		network = 0;
+	}
+	else if (ip_bits(authoritative) < SIZEOF_DATUM * BITS_PER_BYTE)
+	{
+		/* Split ipaddr bits between network and subnet */
+		subnet_bitmask = (((Datum) 1) << subnet_size) - 1;
+		network = ipaddr_datum & ~subnet_bitmask;
+	}
+	else
+	{
+		/* Fit as many ipaddr bits as possible into network */
+		subnet_bitmask = 0;
+		network = ipaddr_datum;
+	}
+
+#if SIZEOF_DATUM == 8
+	if (ip_family(authoritative) == PGSQL_AF_INET)
+	{
+		/*
+		 * IPv4 with 8 byte datums: keep all 32 netmasked bits, netmask size,
+		 * and most significant 25 subnet bits
+		 */
+		Datum		netmask_size = (Datum) ip_bits(authoritative);
+		Datum		subnet;
+
+		/*
+		 * Shift left 31 bits: 6 bits netmask size + 25 subnet bits.
+		 *
+		 * We don't make any distinction between network bits that are zero
+		 * due to masking and "true"/non-masked zero bits.  An abbreviated
+		 * comparison that is resolved by comparing a non-masked and non-zero
+		 * bit to a masked/zeroed bit is effectively resolved based on
+		 * ip_bits(), even though the comparison won't reach the netmask_size
+		 * bits.
+		 */
+		network <<= (ABBREV_BITS_INET4_NETMASK_SIZE +
+					 ABBREV_BITS_INET4_SUBNET);
+
+		/* Shift size to make room for subnet bits at the end */
+		netmask_size <<= ABBREV_BITS_INET4_SUBNET;
+
+		/* Extract subnet bits without shifting them */
+		subnet = ipaddr_datum & subnet_bitmask;
+
+		/*
+		 * If we have more than 25 subnet bits, we can't fit everything. Shift
+		 * subnet down to avoid clobbering bits that are only supposed to be
+		 * used for netmask_size.
+		 *
+		 * Discarding the least significant subnet bits like this is correct
+		 * because abbreviated comparisons that are resolved at the subnet
+		 * level must have had equal netmask_size/ip_bits() values in order to
+		 * get that far.
+		 */
+		if (subnet_size > ABBREV_BITS_INET4_SUBNET)
+			subnet >>= subnet_size - ABBREV_BITS_INET4_SUBNET;
+
+		/*
+		 * Assemble the final abbreviated key without clobbering the ipfamily
+		 * bit that must remain a zero.
+		 */
+		res |= network | netmask_size | subnet;
+	}
+	else
+#endif
+	{
+		/*
+		 * 4 byte datums, or IPv6 with 8 byte datums: Use as many of the
+		 * netmasked bits as will fit in final abbreviated key. Avoid
+		 * clobbering the ipfamily bit that was set earlier.
+		 */
+		res |= network >> 1;
+	}
+
+	uss->input_count += 1;
+
+	/* Hash abbreviated key */
+	if (uss->estimating)
+	{
+		uint32		tmp;
+
+#if SIZEOF_DATUM == 8
+		tmp = (uint32) res ^ (uint32) ((uint64) res >> 32);
+#else							/* SIZEOF_DATUM != 8 */
+		tmp = (uint32) res;
+#endif
+
+		addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+	}
+
+	return res;
+}
+
+/*
  *	Boolean ordering tests.
  */
 Datum