Add 'noError' argument to encoding conversion functions.

With the 'noError' argument, you can try to convert a buffer without knowing the character boundaries beforehand. The functions now need to return the number of input bytes successfully converted. This is is a backwards-incompatible change, if you have created a custom encoding conversion with CREATE CONVERSION. This adds a check to pg_upgrade for that, refusing the upgrade if there are any user-defined encoding conversions. Custom conversions are very rare, there are no commonly used extensions that I know of that uses that feature. No other objects can depend on conversions, so if you do have one, you can fairly easily drop it before upgrading, and recreate it after the upgrade with an updated version. Add regression tests for built-in encoding conversions. This doesn't cover every conversion, but it covers all the internal functions in conv.c that are used to implement the conversions. Reviewed-by: John Naylor Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
author: Heikki Linnakangas 2021-04-01 08:45:22 +0000
committer: Heikki Linnakangas 2021-04-01 08:45:22 +0000
commit: ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (patch)
tree: 1a2c73601043edd7d6512fba51859cc860215e15 /src/backend/utils/mb/conv.c
parent: e2639a767bfa1afebaf1877515a1187feb393443 (diff)
1 files changed, 121 insertions, 18 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index a07b54bd3b8..33e9c9a9e3c 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -25,15 +25,20 @@
  * tab holds conversion entries for the source charset
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the target charset, or 0 if there is no equivalent code.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 local2local(const unsigned char *l,
 			unsigned char *p,
 			int len,
 			int src_encoding,
 			int dest_encoding,
-			const unsigned char *tab)
+			const unsigned char *tab,
+			bool noError)
 {
+	const unsigned char *start = l;
 	unsigned char c1,
 				c2;
 
@@ -41,7 +46,11 @@ local2local(const unsigned char *l,
 	{
 		c1 = *l;
 		if (c1 == 0)
+		{
+			if (noError)
+				break;
 			report_invalid_encoding(src_encoding, (const char *) l, len);
+		}
 		if (!IS_HIGHBIT_SET(c1))
 			*p++ = c1;
 		else
@@ -50,13 +59,19 @@ local2local(const unsigned char *l,
 			if (c2)
 				*p++ = c2;
 			else
+			{
+				if (noError)
+					break;
 				report_untranslatable_char(src_encoding, dest_encoding,
 										   (const char *) l, len);
+			}
 		}
 		l++;
 		len--;
 	}
 	*p = '\0';
+
+	return l - start;
 }
 
 /*
@@ -66,18 +81,26 @@ local2local(const unsigned char *l,
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 latin2mic(const unsigned char *l, unsigned char *p, int len,
-		  int lc, int encoding)
+		  int lc, int encoding, bool noError)
 {
+	const unsigned char *start = l;
 	int			c1;
 
 	while (len > 0)
 	{
 		c1 = *l;
 		if (c1 == 0)
+		{
+			if (noError)
+				break;
 			report_invalid_encoding(encoding, (const char *) l, len);
+		}
 		if (IS_HIGHBIT_SET(c1))
 			*p++ = lc;
 		*p++ = c1;
@@ -85,6 +108,8 @@ latin2mic(const unsigned char *l, unsigned char *p, int len,
 		len--;
 	}
 	*p = '\0';
+
+	return l - start;
 }
 
 /*
@@ -94,18 +119,26 @@ latin2mic(const unsigned char *l, unsigned char *p, int len,
  * p is the output area (must be large enough!)
  * lc is the mule character set id for the local encoding
  * encoding is the PG identifier for the local encoding
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 mic2latin(const unsigned char *mic, unsigned char *p, int len,
-		  int lc, int encoding)
+		  int lc, int encoding, bool noError)
 {
+	const unsigned char *start = mic;
 	int			c1;
 
 	while (len > 0)
 	{
 		c1 = *mic;
 		if (c1 == 0)
+		{
+			if (noError)
+				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+		}
 		if (!IS_HIGHBIT_SET(c1))
 		{
 			/* easy for ASCII */
@@ -118,17 +151,27 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
 			int			l = pg_mule_mblen(mic);
 
 			if (len < l)
+			{
+				if (noError)
+					break;
 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 										len);
+			}
 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
+			{
+				if (noError)
+					break;
 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 										   (const char *) mic, len);
+			}
 			*p++ = mic[1];
 			mic += 2;
 			len -= 2;
 		}
 	}
 	*p = '\0';
+
+	return mic - start;
 }
 
 
@@ -143,15 +186,20 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len,
  * tab holds conversion entries for the local charset
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the mule encoding, or 0 if there is no equivalent code.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 latin2mic_with_table(const unsigned char *l,
 					 unsigned char *p,
 					 int len,
 					 int lc,
 					 int encoding,
-					 const unsigned char *tab)
+					 const unsigned char *tab,
+					 bool noError)
 {
+	const unsigned char *start = l;
 	unsigned char c1,
 				c2;
 
@@ -159,7 +207,11 @@ latin2mic_with_table(const unsigned char *l,
 	{
 		c1 = *l;
 		if (c1 == 0)
+		{
+			if (noError)
+				break;
 			report_invalid_encoding(encoding, (const char *) l, len);
+		}
 		if (!IS_HIGHBIT_SET(c1))
 			*p++ = c1;
 		else
@@ -171,13 +223,19 @@ latin2mic_with_table(const unsigned char *l,
 				*p++ = c2;
 			}
 			else
+			{
+				if (noError)
+					break;
 				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
 										   (const char *) l, len);
+			}
 		}
 		l++;
 		len--;
 	}
 	*p = '\0';
+
+	return l - start;
 }
 
 /*
@@ -191,15 +249,20 @@ latin2mic_with_table(const unsigned char *l,
  * tab holds conversion entries for the mule internal code's second byte,
  * starting from 128 (0x80). each entry in the table holds the corresponding
  * code point for the local charset, or 0 if there is no equivalent code.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 mic2latin_with_table(const unsigned char *mic,
 					 unsigned char *p,
 					 int len,
 					 int lc,
 					 int encoding,
-					 const unsigned char *tab)
+					 const unsigned char *tab,
+					 bool noError)
 {
+	const unsigned char *start = mic;
 	unsigned char c1,
 				c2;
 
@@ -207,7 +270,11 @@ mic2latin_with_table(const unsigned char *mic,
 	{
 		c1 = *mic;
 		if (c1 == 0)
+		{
+			if (noError)
+				break;
 			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+		}
 		if (!IS_HIGHBIT_SET(c1))
 		{
 			/* easy for ASCII */
@@ -220,11 +287,17 @@ mic2latin_with_table(const unsigned char *mic,
 			int			l = pg_mule_mblen(mic);
 
 			if (len < l)
+			{
+				if (noError)
+					break;
 				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 										len);
+			}
 			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
 				(c2 = tab[mic[1] - HIGHBIT]) == 0)
 			{
+				if (noError)
+					break;
 				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 										   (const char *) mic, len);
 				break;			/* keep compiler quiet */
@@ -235,6 +308,8 @@ mic2latin_with_table(const unsigned char *mic,
 		}
 	}
 	*p = '\0';
+
+	return mic - start;
 }
 
 /*
@@ -424,18 +499,22 @@ pg_mb_radix_conv(const pg_mb_radix_tree *rt,
  * is applied.  An error is raised if no match is found.
  *
  * See pg_wchar.h for more details about the data structures used here.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 UtfToLocal(const unsigned char *utf, int len,
 		   unsigned char *iso,
 		   const pg_mb_radix_tree *map,
 		   const pg_utf_to_local_combined *cmap, int cmapsize,
 		   utf_local_conversion_func conv_func,
-		   int encoding)
+		   int encoding, bool noError)
 {
 	uint32		iutf;
 	int			l;
 	const pg_utf_to_local_combined *cp;
+	const unsigned char *start = utf;
 
 	if (!PG_VALID_ENCODING(encoding))
 		ereport(ERROR,
@@ -505,10 +584,19 @@ UtfToLocal(const unsigned char *utf, int len,
 
 			l = pg_utf_mblen(utf);
 			if (len < l)
+			{
+				/* need more data to decide if this is a combined char */
+				utf -= l_save;
 				break;
+			}
 
 			if (!pg_utf8_islegal(utf, l))
+			{
+				if (!noError)
+					report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+				utf -= l_save;
 				break;
+			}
 
 			/* We assume ASCII character cannot be in combined map */
 			if (l > 1)
@@ -584,15 +672,20 @@ UtfToLocal(const unsigned char *utf, int len,
 		}
 
 		/* failed to translate this character */
+		utf -= l;
+		if (noError)
+			break;
 		report_untranslatable_char(PG_UTF8, encoding,
-								   (const char *) (utf - l), len);
+								   (const char *) utf, len);
 	}
 
 	/* if we broke out of loop early, must be invalid input */
-	if (len > 0)
+	if (len > 0 && !noError)
 		report_invalid_encoding(PG_UTF8, (const char *) utf, len);
 
 	*iso = '\0';
+
+	return utf - start;
 }
 
 /*
@@ -616,18 +709,23 @@ UtfToLocal(const unsigned char *utf, int len,
  * (if provided) is applied.  An error is raised if no match is found.
  *
  * See pg_wchar.h for more details about the data structures used here.
+ *
+ * Returns the number of input bytes consumed.  If noError is true, this can
+ * be less than 'len'.
  */
-void
+int
 LocalToUtf(const unsigned char *iso, int len,
 		   unsigned char *utf,
 		   const pg_mb_radix_tree *map,
 		   const pg_local_to_utf_combined *cmap, int cmapsize,
 		   utf_local_conversion_func conv_func,
-		   int encoding)
+		   int encoding,
+		   bool noError)
 {
 	uint32		iiso;
 	int			l;
 	const pg_local_to_utf_combined *cp;
+	const unsigned char *start = iso;
 
 	if (!PG_VALID_ENCODING(encoding))
 		ereport(ERROR,
@@ -723,13 +821,18 @@ LocalToUtf(const unsigned char *iso, int len,
 		}
 
 		/* failed to translate this character */
+		iso -= l;
+		if (noError)
+			break;
 		report_untranslatable_char(encoding, PG_UTF8,
-								   (const char *) (iso - l), len);
+								   (const char *) iso, len);
 	}
 
 	/* if we broke out of loop early, must be invalid input */
-	if (len > 0)
+	if (len > 0 && !noError)
 		report_invalid_encoding(encoding, (const char *) iso, len);
 
 	*utf = '\0';
+
+	return iso - start;
 }
author	Heikki Linnakangas	2021-04-01 08:45:22 +0000
committer	Heikki Linnakangas	2021-04-01 08:45:22 +0000
commit	ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (patch)
tree	1a2c73601043edd7d6512fba51859cc860215e15 /src/backend/utils/mb/conv.c
parent	e2639a767bfa1afebaf1877515a1187feb393443 (diff)