diff options
Diffstat (limited to 'src/backend/access/common')
-rw-r--r-- | src/backend/access/common/Makefile | 5 | ||||
-rw-r--r-- | src/backend/access/common/detoast.c | 869 | ||||
-rw-r--r-- | src/backend/access/common/heaptuple.c | 4 | ||||
-rw-r--r-- | src/backend/access/common/indextuple.c | 9 | ||||
-rw-r--r-- | src/backend/access/common/reloptions.c | 2 | ||||
-rw-r--r-- | src/backend/access/common/toast_internals.c | 632 |
6 files changed, 1515 insertions, 6 deletions
diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index d4695043377..9ac19d9f9e5 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -12,7 +12,8 @@ subdir = src/backend/access/common top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \ - relation.o reloptions.o scankey.o session.o tupconvert.o tupdesc.o +OBJS = bufmask.o detoast.o heaptuple.o indextuple.o printsimple.o \ + printtup.o relation.o reloptions.o scankey.o session.o toast_internals.o \ + tupconvert.o tupdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/detoast.c b/src/backend/access/common/detoast.c new file mode 100644 index 00000000000..c8b49d6a124 --- /dev/null +++ b/src/backend/access/common/detoast.c @@ -0,0 +1,869 @@ +/*------------------------------------------------------------------------- + * + * detoast.c + * Retrieve compressed or external variable size attributes. 
+ * + * Copyright (c) 2000-2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/common/detoast.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heaptoast.h" +#include "access/table.h" +#include "access/toast_internals.h" +#include "common/pg_lzcompress.h" +#include "utils/expandeddatum.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" + +static struct varlena *toast_fetch_datum(struct varlena *attr); +static struct varlena *toast_fetch_datum_slice(struct varlena *attr, + int32 sliceoffset, int32 length); +static struct varlena *toast_decompress_datum(struct varlena *attr); +static struct varlena *toast_decompress_datum_slice(struct varlena *attr, int32 slicelength); + +/* ---------- + * heap_tuple_fetch_attr - + * + * Public entry point to get back a toasted value from + * external source (possibly still in compressed format). + * + * This will return a datum that contains all the data internally, ie, not + * relying on external storage or memory, but it can still be compressed or + * have a short header. Note some callers assume that if the input is an + * EXTERNAL datum, the result will be a pfree'able chunk. 
+ * ---------- + */ +struct varlena * +heap_tuple_fetch_attr(struct varlena *attr) +{ + struct varlena *result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * This is an external stored plain value + * (fetched as stored; this function never decompresses) + */ + result = toast_fetch_datum(attr); + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + /* + * This is an indirect pointer --- dereference it + */ + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + attr = (struct varlena *) redirect.pointer; + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); + + /* recurse if value is still external in some other way */ + if (VARATT_IS_EXTERNAL(attr)) + return heap_tuple_fetch_attr(attr); + + /* + * Copy into the caller's memory context, in case caller tries to + * pfree the result. + */ + result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + memcpy(result, attr, VARSIZE_ANY(attr)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* + * This is an expanded-object pointer --- get flat format + * in a newly palloc'd buffer of EOH_get_flat_size() bytes + */ + ExpandedObjectHeader *eoh; + Size resultsize; + + eoh = DatumGetEOHP(PointerGetDatum(attr)); + resultsize = EOH_get_flat_size(eoh); + result = (struct varlena *) palloc(resultsize); + EOH_flatten_into(eoh, (void *) result, resultsize); + } + else + { + /* + * This is a plain value inside of the main tuple - why am I called? + * The input pointer is returned as-is, without making a copy. + */ + result = attr; + } + + return result; +} + + +/* ---------- + * heap_tuple_untoast_attr - + * + * Public entry point to get back a toasted value from compression + * or external storage. The result is always non-extended varlena form. + * + * Note some callers assume that if the input is an EXTERNAL or COMPRESSED + * datum, the result will be a pfree'able chunk. 
+ * ---------- + */ +struct varlena * +heap_tuple_untoast_attr(struct varlena *attr) +{ + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * This is an externally stored datum --- fetch it back from there + */ + attr = toast_fetch_datum(attr); + /* If it's compressed, decompress it and free the fetched copy */ + if (VARATT_IS_COMPRESSED(attr)) + { + struct varlena *tmp = attr; + + attr = toast_decompress_datum(tmp); + pfree(tmp); + } + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + /* + * This is an indirect pointer --- dereference it + */ + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + attr = (struct varlena *) redirect.pointer; + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr)); + + /* recurse in case value is still extended in some other way */ + attr = heap_tuple_untoast_attr(attr); + + /* if it isn't (the recursive call returned its input unchanged), we'd better copy it */ + if (attr == (struct varlena *) redirect.pointer) + { + struct varlena *result; + + result = (struct varlena *) palloc(VARSIZE_ANY(attr)); + memcpy(result, attr, VARSIZE_ANY(attr)); + attr = result; + } + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* + * This is an expanded-object pointer --- get flat format + */ + attr = heap_tuple_fetch_attr(attr); + /* flatteners are not allowed to produce compressed/short output */ + Assert(!VARATT_IS_EXTENDED(attr)); + } + else if (VARATT_IS_COMPRESSED(attr)) + { + /* + * This is a compressed value inside of the main tuple + */ + attr = toast_decompress_datum(attr); + } + else if (VARATT_IS_SHORT(attr)) + { + /* + * This is a short-header varlena --- convert to 4-byte header format + * by copying the payload into a newly palloc'd chunk + */ + Size data_size = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT; + Size new_size = data_size + VARHDRSZ; + struct varlena *new_attr; + + new_attr = (struct varlena *) palloc(new_size); + SET_VARSIZE(new_attr, new_size); + memcpy(VARDATA(new_attr), VARDATA_SHORT(attr), data_size); + attr = new_attr; + } + + return attr; /* inputs matching none of the branches above are returned unchanged */ +} + + +/* ---------- + * 
heap_tuple_untoast_attr_slice - + * + * Public entry point to get back part of a toasted value + * from compression or external storage. + * ---------- + */ +struct varlena * +heap_tuple_untoast_attr_slice(struct varlena *attr, + int32 sliceoffset, int32 slicelength) +{ + struct varlena *preslice; + struct varlena *result; + char *attrdata; + int32 attrsize; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* fast path for non-compressed external datums */ + if (!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + return toast_fetch_datum_slice(attr, sliceoffset, slicelength); + + /* fetch it back (compressed marker will get set automatically) */ + preslice = toast_fetch_datum(attr); + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect redirect; + + VARATT_EXTERNAL_GET_POINTER(redirect, attr); + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(redirect.pointer)); + + return heap_tuple_untoast_attr_slice(redirect.pointer, + sliceoffset, slicelength); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + /* pass it off to heap_tuple_fetch_attr to flatten */ + preslice = heap_tuple_fetch_attr(attr); + } + else + preslice = attr; + + Assert(!VARATT_IS_EXTERNAL(preslice)); + + if (VARATT_IS_COMPRESSED(preslice)) + { + struct varlena *tmp = preslice; + + /* Decompress enough to encompass the slice and the offset */ + if (slicelength > 0 && sliceoffset >= 0) + preslice = toast_decompress_datum_slice(tmp, slicelength + sliceoffset); + else + preslice = toast_decompress_datum(tmp); + + if (tmp != attr) + pfree(tmp); + } + + if (VARATT_IS_SHORT(preslice)) + { + attrdata = VARDATA_SHORT(preslice); + attrsize = VARSIZE_SHORT(preslice) - VARHDRSZ_SHORT; + } + else + { + attrdata = VARDATA(preslice); + attrsize = VARSIZE(preslice) - VARHDRSZ; + } + + /* slicing of datum for compressed cases and plain value */ + + if (sliceoffset 
>= attrsize) + { + sliceoffset = 0; + slicelength = 0; + } + + if (((sliceoffset + slicelength) > attrsize) || slicelength < 0) + slicelength = attrsize - sliceoffset; + + result = (struct varlena *) palloc(slicelength + VARHDRSZ); + SET_VARSIZE(result, slicelength + VARHDRSZ); + + memcpy(VARDATA(result), attrdata + sliceoffset, slicelength); + + if (preslice != attr) + pfree(preslice); + + return result; +} + +/* ---------- + * toast_fetch_datum - + * + * Reconstruct an in memory Datum from the chunks saved + * in the toast relation + * ---------- + */ +static struct varlena * +toast_fetch_datum(struct varlena *attr) +{ + Relation toastrel; + Relation *toastidxs; + ScanKeyData toastkey; + SysScanDesc toastscan; + HeapTuple ttup; + TupleDesc toasttupDesc; + struct varlena *result; + struct varatt_external toast_pointer; + int32 ressize; + int32 residx, + nextidx; + int32 numchunks; + Pointer chunk; + bool isnull; + char *chunkdata; + int32 chunksize; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + elog(ERROR, "toast_fetch_datum shouldn't be called for non-ondisk datums"); + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + ressize = toast_pointer.va_extsize; + numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; + + result = (struct varlena *) palloc(ressize + VARHDRSZ); + + if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) + SET_VARSIZE_COMPRESSED(result, ressize + VARHDRSZ); + else + SET_VARSIZE(result, ressize + VARHDRSZ); + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, AccessShareLock); + toasttupDesc = toastrel->rd_att; + + /* Look for the valid index of the toast relation */ + validIndex = toast_open_indexes(toastrel, + AccessShareLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to fetch from the index by va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + 
BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(toast_pointer.va_valueid)); + + /* + * Read the chunks by index + * + * Note that because the index is actually on (valueid, chunkidx) we will + * see the chunks in chunkidx order, even though we didn't explicitly ask + * for it. + */ + nextidx = 0; + + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, 1, &toastkey); + while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + /* + * Have a chunk, extract the sequence number and the data + */ + residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); + Assert(!isnull); + chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); + Assert(!isnull); + if (!VARATT_IS_EXTENDED(chunk)) + { + chunksize = VARSIZE(chunk) - VARHDRSZ; + chunkdata = VARDATA(chunk); + } + else if (VARATT_IS_SHORT(chunk)) + { + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + chunkdata = VARDATA_SHORT(chunk); + } + else + { + /* should never happen */ + elog(ERROR, "found toasted toast chunk for toast value %u in %s", + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + chunksize = 0; /* keep compiler quiet */ + chunkdata = NULL; + } + + /* + * Some checks on the data we've found + */ + if (residx != nextidx) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (expected %d) for toast value %u in %s", + residx, nextidx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)))); + if (residx < numchunks - 1) + { + if (chunksize != TOAST_MAX_CHUNK_SIZE) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", + chunksize, (int) TOAST_MAX_CHUNK_SIZE, + residx, numchunks, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)))); + 
} + else if (residx == numchunks - 1) + { + if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s", + chunksize, + (int) (ressize - residx * TOAST_MAX_CHUNK_SIZE), + residx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)))); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", + residx, + 0, numchunks - 1, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)))); + + /* + * Copy the data into proper place in our result + */ + memcpy(VARDATA(result) + residx * TOAST_MAX_CHUNK_SIZE, + chunkdata, + chunksize); + + nextidx++; + } + + /* + * Final checks that we successfully fetched the datum + */ + if (nextidx != numchunks) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("missing chunk number %d for toast value %u in %s", + nextidx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)))); + + /* + * End scan and close relations + */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, AccessShareLock); + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_fetch_datum_slice - + * + * Reconstruct a segment of a Datum from the chunks saved + * in the toast relation + * + * Note that this function only supports non-compressed external datums. 
+ * ---------- + */ +static struct varlena * +toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 length) +{ + Relation toastrel; + Relation *toastidxs; + ScanKeyData toastkey[3]; + int nscankeys; + SysScanDesc toastscan; + HeapTuple ttup; + TupleDesc toasttupDesc; + struct varlena *result; + struct varatt_external toast_pointer; + int32 attrsize; + int32 residx; + int32 nextidx; + int numchunks; + int startchunk; + int endchunk; + int32 startoffset; + int32 endoffset; + int totalchunks; + Pointer chunk; + bool isnull; + char *chunkdata; + int32 chunksize; + int32 chcpystrt; + int32 chcpyend; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + elog(ERROR, "toast_fetch_datum_slice shouldn't be called for non-ondisk datums"); + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * It's nonsense to fetch slices of a compressed datum -- this isn't lo_* + * we can't return a compressed datum which is meaningful to toast later + */ + Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); + + attrsize = toast_pointer.va_extsize; + totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; + + if (sliceoffset >= attrsize) + { + sliceoffset = 0; + length = 0; + } + + if (((sliceoffset + length) > attrsize) || length < 0) + length = attrsize - sliceoffset; + + result = (struct varlena *) palloc(length + VARHDRSZ); + + SET_VARSIZE(result, length + VARHDRSZ); + + if (length == 0) + return result; /* Can save a lot of work at this point! 
*/ + + startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; + endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE; + numchunks = (endchunk - startchunk) + 1; + + startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE; + endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE; + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, AccessShareLock); + toasttupDesc = toastrel->rd_att; + + /* Look for the valid index of toast relation */ + validIndex = toast_open_indexes(toastrel, + AccessShareLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to fetch from the index. This is either two keys or + * three depending on the number of chunks. + */ + ScanKeyInit(&toastkey[0], + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(toast_pointer.va_valueid)); + + /* + * Use equality condition for one chunk, a range condition otherwise: + */ + if (numchunks == 1) + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(startchunk)); + nscankeys = 2; + } + else + { + ScanKeyInit(&toastkey[1], + (AttrNumber) 2, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(startchunk)); + ScanKeyInit(&toastkey[2], + (AttrNumber) 2, + BTLessEqualStrategyNumber, F_INT4LE, + Int32GetDatum(endchunk)); + nscankeys = 3; + } + + /* + * Read the chunks by index + * + * The index is on (valueid, chunkidx) so they will come in order + */ + init_toast_snapshot(&SnapshotToast); + nextidx = startchunk; + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, nscankeys, toastkey); + while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + /* + * Have a chunk, extract the sequence number and the data + */ + residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); + Assert(!isnull); + chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); + Assert(!isnull); + if 
(!VARATT_IS_EXTENDED(chunk)) + { + chunksize = VARSIZE(chunk) - VARHDRSZ; + chunkdata = VARDATA(chunk); + } + else if (VARATT_IS_SHORT(chunk)) + { + /* could happen due to heap_form_tuple doing its thing */ + chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; + chunkdata = VARDATA_SHORT(chunk); + } + else + { + /* should never happen */ + elog(ERROR, "found toasted toast chunk for toast value %u in %s", + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + chunksize = 0; /* keep compiler quiet */ + chunkdata = NULL; + } + + /* + * Some checks on the data we've found + */ + if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk)) + elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s", + residx, nextidx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + if (residx < totalchunks - 1) + { + if (chunksize != TOAST_MAX_CHUNK_SIZE) + elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s when fetching slice", + chunksize, (int) TOAST_MAX_CHUNK_SIZE, + residx, totalchunks, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + } + else if (residx == totalchunks - 1) + { + if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize) + elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s when fetching slice", + chunksize, + (int) (attrsize - residx * TOAST_MAX_CHUNK_SIZE), + residx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + } + else + elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", + residx, + 0, totalchunks - 1, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + + /* + * Copy the data into proper place in our result + */ + chcpystrt = 0; + chcpyend = chunksize - 1; + if (residx == startchunk) + chcpystrt = startoffset; + if (residx == endchunk) + chcpyend = endoffset; + + memcpy(VARDATA(result) + + (residx * 
TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, + chunkdata + chcpystrt, + (chcpyend - chcpystrt) + 1); + + nextidx++; + } + + /* + * Final checks that we successfully fetched the datum + */ + if (nextidx != (endchunk + 1)) + elog(ERROR, "missing chunk number %d for toast value %u in %s", + nextidx, + toast_pointer.va_valueid, + RelationGetRelationName(toastrel)); + + /* + * End scan and close relations + */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, AccessShareLock); + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_decompress_datum - + * + * Decompress a compressed version of a varlena datum + */ +static struct varlena * +toast_decompress_datum(struct varlena *attr) +{ + struct varlena *result; + + Assert(VARATT_IS_COMPRESSED(attr)); + + result = (struct varlena *) + palloc(TOAST_COMPRESS_RAWSIZE(attr) + VARHDRSZ); + SET_VARSIZE(result, TOAST_COMPRESS_RAWSIZE(attr) + VARHDRSZ); + + if (pglz_decompress(TOAST_COMPRESS_RAWDATA(attr), + VARSIZE(attr) - TOAST_COMPRESS_HDRSZ, + VARDATA(result), + TOAST_COMPRESS_RAWSIZE(attr), true) < 0) + elog(ERROR, "compressed data is corrupted"); + + return result; +} + + +/* ---------- + * toast_decompress_datum_slice - + * + * Decompress the front of a compressed version of a varlena datum. + * offset handling happens in heap_tuple_untoast_attr_slice. + * Here we just decompress a slice from the front. 
+ */ +static struct varlena * +toast_decompress_datum_slice(struct varlena *attr, int32 slicelength) +{ + struct varlena *result; + int32 rawsize; + + Assert(VARATT_IS_COMPRESSED(attr)); + + result = (struct varlena *) palloc(slicelength + VARHDRSZ); /* buffer sized for slicelength payload bytes plus header */ + + rawsize = pglz_decompress(TOAST_COMPRESS_RAWDATA(attr), + VARSIZE(attr) - TOAST_COMPRESS_HDRSZ, + VARDATA(result), + slicelength, false); /* NOTE(review): final "false" presumably disables the complete-decompression check --- confirm against pglz_decompress */ + if (rawsize < 0) + elog(ERROR, "compressed data is corrupted"); + + SET_VARSIZE(result, rawsize + VARHDRSZ); /* length comes from pglz_decompress's return value, not from slicelength */ + return result; +} + +/* ---------- + * toast_raw_datum_size - + * + * Return the raw (detoasted) size of a varlena datum + * (including the VARHDRSZ header) + * ---------- + */ +Size +toast_raw_datum_size(Datum value) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* va_rawsize is the size of the original datum -- including header */ + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + result = toast_pointer.va_rawsize; + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* nested indirect Datums aren't allowed */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(toast_pointer.pointer)); + + return toast_raw_datum_size(PointerGetDatum(toast_pointer.pointer)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + result = EOH_get_flat_size(DatumGetEOHP(value)); + } + else if (VARATT_IS_COMPRESSED(attr)) + { + /* here, va_rawsize is just the payload size */ + result = VARRAWSIZE_4B_C(attr) + VARHDRSZ; + } + else if (VARATT_IS_SHORT(attr)) + { + /* + * we have to normalize the header length to VARHDRSZ or else the + * callers of this function will be confused. 
+ */ + result = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT + VARHDRSZ; + } + else + { + /* plain untoasted datum */ + result = VARSIZE(attr); + } + return result; +} + +/* ---------- + * toast_datum_size + * + * Return the physical storage size (possibly compressed) of a varlena datum + * + * External on-disk values report va_extsize, indirect pointers are + * followed to the datum they reference, expanded objects report their + * flat size, and inline values report their stored size including + * whichever header (short or 4-byte) they carry. + * ---------- + */ +Size +toast_datum_size(Datum value) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + Size result; + + if (VARATT_IS_EXTERNAL_ONDISK(attr)) + { + /* + * Attribute is stored externally - return the extsize whether + * compressed or not. We do not count the size of the toast pointer + * ... should we? + */ + struct varatt_external toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + result = toast_pointer.va_extsize; + } + else if (VARATT_IS_EXTERNAL_INDIRECT(attr)) + { + struct varatt_indirect toast_pointer; + + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * nested indirect Datums aren't allowed; the check must look at the + * pointed-to datum, since attr itself is known to be indirect here + */ + Assert(!VARATT_IS_EXTERNAL_INDIRECT(toast_pointer.pointer)); + + return toast_datum_size(PointerGetDatum(toast_pointer.pointer)); + } + else if (VARATT_IS_EXTERNAL_EXPANDED(attr)) + { + result = EOH_get_flat_size(DatumGetEOHP(value)); + } + else if (VARATT_IS_SHORT(attr)) + { + result = VARSIZE_SHORT(attr); + } + else + { + /* + * Attribute is stored inline either compressed or not, just calculate + * the size of the datum in either case. + */ + result = VARSIZE(attr); + } + return result; +} diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index a48a6cd757f..cc948958d7e 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -18,7 +18,7 @@ * (In performance-critical code paths we can use pg_detoast_datum_packed * and the appropriate access macros to avoid that overhead.) Note that this * conversion is performed directly in heap_form_tuple, without invoking - * tuptoaster.c. + * heaptoast.c. 
* * This change will break any code that assumes it needn't detoast values * that have been put into a tuple but never sent to disk. Hopefully there @@ -57,9 +57,9 @@ #include "postgres.h" +#include "access/heaptoast.h" #include "access/sysattr.h" #include "access/tupdesc_details.h" -#include "access/tuptoaster.h" #include "executor/tuptable.h" #include "utils/expandeddatum.h" diff --git a/src/backend/access/common/indextuple.c b/src/backend/access/common/indextuple.c index cb23be859de..07586201b9d 100644 --- a/src/backend/access/common/indextuple.c +++ b/src/backend/access/common/indextuple.c @@ -16,10 +16,17 @@ #include "postgres.h" +#include "access/detoast.h" +#include "access/heaptoast.h" #include "access/htup_details.h" #include "access/itup.h" -#include "access/tuptoaster.h" +#include "access/toast_internals.h" +/* + * This enables de-toasting of index entries. Needed until VACUUM is + * smart enough to rebuild indexes from scratch. + */ +#define TOAST_INDEX_HACK /* ---------------------------------------------------------------- * index_ tuple interface routines diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 42647b05265..20f4ed3c386 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -19,11 +19,11 @@ #include "access/gist_private.h" #include "access/hash.h" +#include "access/heaptoast.h" #include "access/htup_details.h" #include "access/nbtree.h" #include "access/reloptions.h" #include "access/spgist.h" -#include "access/tuptoaster.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c new file mode 100644 index 00000000000..a9712424901 --- /dev/null +++ b/src/backend/access/common/toast_internals.c @@ -0,0 +1,632 @@ +/*------------------------------------------------------------------------- + * + * toast_internals.c + 
* Functions for internal use by the TOAST system. + * + * Copyright (c) 2000-2019, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/common/toast_internals.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/table.h" +#include "access/toast_internals.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "common/pg_lzcompress.h" +#include "miscadmin.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); +static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); + +/* ---------- + * toast_compress_datum - + * + * Create a compressed version of a varlena datum + * + * If we fail (ie, compressed result is actually bigger than original) + * then return NULL. We must not use compressed data if it'd expand + * the tuple! + * + * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without + * copying them. But we can't handle external or compressed datums. 
+ * ---------- + */ +Datum +toast_compress_datum(Datum value) +{ + struct varlena *tmp; + int32 valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); + int32 len; + + Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value))); + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); + + /* + * No point in wasting a palloc cycle if value size is out of the allowed + * range for compression + */ + if (valsize < PGLZ_strategy_default->min_input_size || + valsize > PGLZ_strategy_default->max_input_size) + return PointerGetDatum(NULL); + + tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize) + + TOAST_COMPRESS_HDRSZ); + + /* + * We recheck the actual size even if pglz_compress() reports success, + * because it might be satisfied with having saved as little as one byte + * in the compressed data --- which could turn into a net loss once you + * consider header and alignment padding. Worst case, the compressed + * format might require three padding bytes (plus header, which is + * included in VARSIZE(tmp)), whereas the uncompressed format would take + * only one header byte and no padding if the value is short enough. So + * we insist on a savings of more than 2 bytes to ensure we have a gain. + */ + len = pglz_compress(VARDATA_ANY(DatumGetPointer(value)), + valsize, + TOAST_COMPRESS_RAWDATA(tmp), + PGLZ_strategy_default); + if (len >= 0 && + len + TOAST_COMPRESS_HDRSZ < valsize - 2) + { + TOAST_COMPRESS_SET_RAWSIZE(tmp, valsize); + SET_VARSIZE_COMPRESSED(tmp, len + TOAST_COMPRESS_HDRSZ); + /* successful compression */ + return PointerGetDatum(tmp); + } + else + { + /* incompressible data */ + pfree(tmp); + return PointerGetDatum(NULL); + } +} + +/* ---------- + * toast_save_datum - + * + * Save one single datum into the secondary relation and return + * a Datum reference for it. + * + * rel: the main relation we're working with (not the toast rel!) 
+ * value: datum to be pushed to toast storage + * oldexternal: if not NULL, toast pointer previously representing the datum + * options: options to be passed to heap_insert() for toast rows + * ---------- + */ +Datum +toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, int options) +{ + Relation toastrel; + Relation *toastidxs; + HeapTuple toasttup; + TupleDesc toasttupDesc; + Datum t_values[3]; + bool t_isnull[3]; + CommandId mycid = GetCurrentCommandId(true); + struct varlena *result; + struct varatt_external toast_pointer; + union + { + struct varlena hdr; + /* this is to make the union big enough for a chunk: */ + char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } chunk_data; + int32 chunk_size; + int32 chunk_seq = 0; + char *data_p; + int32 data_todo; + Pointer dval = DatumGetPointer(value); + int num_indexes; + int validIndex; + + Assert(!VARATT_IS_EXTERNAL(value)); + + /* + * Open the toast relation and its indexes. We can use the index to check + * uniqueness of the OID we assign to the toasted item, even though it has + * additional columns besides OID. + */ + toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); + toasttupDesc = toastrel->rd_att; + + /* Open all the toast indexes and look for the valid one */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Get the data pointer and length, and compute va_rawsize and va_extsize. + * + * va_rawsize is the size of the equivalent fully uncompressed datum, so + * we have to adjust for short headers. + * + * va_extsize is the actual size of the data payload in the toast records. 
+ */ + if (VARATT_IS_SHORT(dval)) + { + data_p = VARDATA_SHORT(dval); + data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; + toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ + toast_pointer.va_extsize = data_todo; + } + else if (VARATT_IS_COMPRESSED(dval)) + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + /* rawsize in a compressed datum is just the size of the payload */ + toast_pointer.va_rawsize = VARRAWSIZE_4B_C(dval) + VARHDRSZ; + toast_pointer.va_extsize = data_todo; + /* Assert that the numbers look like it's compressed */ + Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); + } + else + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + toast_pointer.va_rawsize = VARSIZE(dval); + toast_pointer.va_extsize = data_todo; + } + + /* + * Insert the correct table OID into the result TOAST pointer. + * + * Normally this is the actual OID of the target toast table, but during + * table-rewriting operations such as CLUSTER, we have to insert the OID + * of the table's real permanent toast table instead. rd_toastoid is set + * if we have to substitute such an OID. + */ + if (OidIsValid(rel->rd_toastoid)) + toast_pointer.va_toastrelid = rel->rd_toastoid; + else + toast_pointer.va_toastrelid = RelationGetRelid(toastrel); + + /* + * Choose an OID to use as the value ID for this toast value. + * + * Normally we just choose an unused OID within the toast table. But + * during table-rewriting operations where we are preserving an existing + * toast table OID, we want to preserve toast value OIDs too. So, if + * rd_toastoid is set and we had a prior external value from that same + * toast table, re-use its value ID. If we didn't have a prior external + * value (which is a corner case, but possible if the table's attstorage + * options have been changed), we have to pick a value ID that doesn't + * conflict with either new or existing toast value OIDs. 
+ */ + if (!OidIsValid(rel->rd_toastoid)) + { + /* normal case: just choose an unused OID */ + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } + else + { + /* rewrite case: check to see if value was in old toast table */ + toast_pointer.va_valueid = InvalidOid; + if (oldexternal != NULL) + { + struct varatt_external old_toast_pointer; + + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); + if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) + { + /* This value came from the old toast table; reuse its OID */ + toast_pointer.va_valueid = old_toast_pointer.va_valueid; + + /* + * There is a corner case here: the table rewrite might have + * to copy both live and recently-dead versions of a row, and + * those versions could easily reference the same toast value. + * When we copy the second or later version of such a row, + * reusing the OID will mean we select an OID that's already + * in the new toast table. Check for that, and if so, just + * fall through without writing the data again. + * + * While annoying and ugly-looking, this is a good thing + * because it ensures that we wind up with only one copy of + * the toast value when there is only one copy in the old + * toast table. Before we detected this case, we'd have made + * multiple copies, wasting space; and what's worse, the + * copies belonging to already-deleted heap tuples would not + * be reclaimed by VACUUM. 
+ */ + if (toastrel_valueid_exists(toastrel, + toast_pointer.va_valueid)) + { + /* Match, so short-circuit the data storage loop below */ + data_todo = 0; + } + } + } + if (toast_pointer.va_valueid == InvalidOid) + { + /* + * new value; must choose an OID that doesn't conflict in either + * old or new toast table + */ + do + { + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } while (toastid_valueid_exists(rel->rd_toastoid, + toast_pointer.va_valueid)); + } + } + + /* + * Initialize constant parts of the tuple data + */ + t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); + t_values[2] = PointerGetDatum(&chunk_data); + t_isnull[0] = false; + t_isnull[1] = false; + t_isnull[2] = false; + + /* + * Split up the item into chunks + */ + while (data_todo > 0) + { + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Calculate the size of this chunk + */ + chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); + + /* + * Build a tuple and store it + */ + t_values[1] = Int32GetDatum(chunk_seq++); + SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); + memcpy(VARDATA(&chunk_data), data_p, chunk_size); + toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); + + heap_insert(toastrel, toasttup, mycid, options, NULL); + + /* + * Create the index entry. We cheat a little here by not using + * FormIndexDatum: this relies on the knowledge that the index columns + * are the same as the initial columns of the table for all the + * indexes. We also cheat by not providing an IndexInfo: this is okay + * for now because btree doesn't need one, but we might have to be + * more honest someday. + * + * Note also that there had better not be any user-created index on + * the TOAST table, since we don't bother to update anything else. 
+ */ + for (i = 0; i < num_indexes; i++) + { + /* Only index relations marked as ready can be updated */ + if (toastidxs[i]->rd_index->indisready) + index_insert(toastidxs[i], t_values, t_isnull, + &(toasttup->t_self), + toastrel, + toastidxs[i]->rd_index->indisunique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + NULL); + } + + /* + * Free memory + */ + heap_freetuple(toasttup); + + /* + * Move on to next chunk + */ + data_todo -= chunk_size; + data_p += chunk_size; + } + + /* + * Done - close toast relation and its indexes + */ + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + table_close(toastrel, RowExclusiveLock); + + /* + * Create the TOAST pointer value that we'll return + */ + result = (struct varlena *) palloc(TOAST_POINTER_SIZE); + SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); + memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); + + return PointerGetDatum(result); +} + +/* ---------- + * toast_delete_datum - + * + * Delete a single external stored value. 
+ * ---------- + */ +void +toast_delete_datum(Relation rel, Datum value, bool is_speculative) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + struct varatt_external toast_pointer; + Relation toastrel; + Relation *toastidxs; + ScanKeyData toastkey; + SysScanDesc toastscan; + HeapTuple toasttup; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + return; + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); + + /* Fetch valid relation used for process */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(toast_pointer.va_valueid)); + + /* + * Find all the chunks. (We don't actually care whether we see them in + * sequence or not, but since we've already locked the index we might as + * well use systable_beginscan_ordered.) + */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, 1, &toastkey); + while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + /* + * Have a chunk, delete it + */ + if (is_speculative) + heap_abort_speculative(toastrel, &toasttup->t_self); + else + simple_heap_delete(toastrel, &toasttup->t_self); + } + + /* + * End scan and close relations + */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + table_close(toastrel, RowExclusiveLock); +} + +/* ---------- + * toastrel_valueid_exists - + * + * Test whether a toast value with the given ID exists in the toast relation. 
+ * For safety, we consider a value to exist if there are either live or dead + * toast rows with that ID; see notes for GetNewOidWithIndex(). + * ---------- + */ +static bool +toastrel_valueid_exists(Relation toastrel, Oid valueid) +{ + bool result = false; + ScanKeyData toastkey; + SysScanDesc toastscan; + int num_indexes; + int validIndex; + Relation *toastidxs; + + /* Fetch a valid index relation */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * Is there any such chunk? + */ + toastscan = systable_beginscan(toastrel, + RelationGetRelid(toastidxs[validIndex]), + true, SnapshotAny, 1, &toastkey); + + if (systable_getnext(toastscan) != NULL) + result = true; + + systable_endscan(toastscan); + + /* Clean up */ + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + + return result; +} + +/* ---------- + * toastid_valueid_exists - + * + * As above, but work from toast rel's OID not an open relation + * ---------- + */ +static bool +toastid_valueid_exists(Oid toastrelid, Oid valueid) +{ + bool result; + Relation toastrel; + + toastrel = table_open(toastrelid, AccessShareLock); + + result = toastrel_valueid_exists(toastrel, valueid); + + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_get_valid_index + * + * Get OID of valid index associated to given toast relation. A toast + * relation can have only one valid index at the same time. 
+ */ +Oid +toast_get_valid_index(Oid toastoid, LOCKMODE lock) +{ + int num_indexes; + int validIndex; + Oid validIndexOid; + Relation *toastidxs; + Relation toastrel; + + /* Open the toast relation */ + toastrel = table_open(toastoid, lock); + + /* Look for the valid index of the toast relation */ + validIndex = toast_open_indexes(toastrel, + lock, + &toastidxs, + &num_indexes); + validIndexOid = RelationGetRelid(toastidxs[validIndex]); + + /* Close the toast relation and all its indexes */ + toast_close_indexes(toastidxs, num_indexes, lock); + table_close(toastrel, lock); + + return validIndexOid; +} + +/* ---------- + * toast_open_indexes + * + * Get an array of the indexes associated to the given toast relation + * and return as well the position of the valid index used by the toast + * relation in this array. It is the responsibility of the caller of this + * function to close the indexes as well as free them. + */ +int +toast_open_indexes(Relation toastrel, + LOCKMODE lock, + Relation **toastidxs, + int *num_indexes) +{ + int i = 0; + int res = 0; + bool found = false; + List *indexlist; + ListCell *lc; + + /* Get index list of the toast relation */ + indexlist = RelationGetIndexList(toastrel); + Assert(indexlist != NIL); + + *num_indexes = list_length(indexlist); + + /* Open all the index relations */ + *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); + foreach(lc, indexlist) + (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); + + /* Fetch the first valid index in list */ + for (i = 0; i < *num_indexes; i++) + { + Relation toastidx = (*toastidxs)[i]; + + if (toastidx->rd_index->indisvalid) + { + res = i; + found = true; + break; + } + } + + /* + * Free index list, not necessary anymore as relations are opened and a + * valid index has been found. + */ + list_free(indexlist); + + /* + * The toast relation should have one valid index, so something is going + * wrong if there is nothing. 
+ */ + if (!found) + elog(ERROR, "no valid index found for toast relation with Oid %u", + RelationGetRelid(toastrel)); + + return res; +} + +/* ---------- + * toast_close_indexes + * + * Close an array of indexes for a toast relation and free it. This should + * be called for a set of indexes opened previously with toast_open_indexes. + */ +void +toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) +{ + int i; + + /* Close relations and clean up things */ + for (i = 0; i < num_indexes; i++) + index_close(toastidxs[i], lock); + pfree(toastidxs); +} + +/* ---------- + * init_toast_snapshot + * + * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot + * to initialize the TOAST snapshot; since we don't know which one to use, + * just use the oldest one. This is safe: at worst, we will get a "snapshot + * too old" error that might have been avoided otherwise. + */ +void +init_toast_snapshot(Snapshot toast_snapshot) +{ + Snapshot snapshot = GetOldestSnapshot(); + + if (snapshot == NULL) + elog(ERROR, "no known snapshots"); + + InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); +} |