diff options
Diffstat (limited to 'src/backend/access/common/toast_internals.c')
-rw-r--r-- | src/backend/access/common/toast_internals.c | 673 |
1 files changed, 673 insertions, 0 deletions
diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c new file mode 100644 index 0000000..576e585 --- /dev/null +++ b/src/backend/access/common/toast_internals.c @@ -0,0 +1,673 @@ +/*------------------------------------------------------------------------- + * + * toast_internals.c + * Functions for internal use by the TOAST system. + * + * Copyright (c) 2000-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/common/toast_internals.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heaptoast.h" +#include "access/table.h" +#include "access/toast_internals.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "common/pg_lzcompress.h" +#include "miscadmin.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +static bool toastrel_valueid_exists(Relation toastrel, Oid valueid); +static bool toastid_valueid_exists(Oid toastrelid, Oid valueid); + +/* ---------- + * toast_compress_datum - + * + * Create a compressed version of a varlena datum + * + * If we fail (ie, compressed result is actually bigger than original) + * then return NULL. We must not use compressed data if it'd expand + * the tuple! + * + * We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without + * copying them. But we can't handle external or compressed datums. + * ---------- + */ +Datum +toast_compress_datum(Datum value, char cmethod) +{ + struct varlena *tmp = NULL; + int32 valsize; + ToastCompressionId cmid = TOAST_INVALID_COMPRESSION_ID; + + Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value))); + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value))); + + valsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value)); + + /* If the compression method is not valid, use the current default */ + if (!CompressionMethodIsValid(cmethod)) + cmethod = default_toast_compression; + + /* + * Call appropriate compression routine for the compression method. + */ + switch (cmethod) + { + case TOAST_PGLZ_COMPRESSION: + tmp = pglz_compress_datum((const struct varlena *) value); + cmid = TOAST_PGLZ_COMPRESSION_ID; + break; + case TOAST_LZ4_COMPRESSION: + tmp = lz4_compress_datum((const struct varlena *) value); + cmid = TOAST_LZ4_COMPRESSION_ID; + break; + default: + elog(ERROR, "invalid compression method %c", cmethod); + } + + if (tmp == NULL) + return PointerGetDatum(NULL); + + /* + * We recheck the actual size even if compression reports success, because + * it might be satisfied with having saved as little as one byte in the + * compressed data --- which could turn into a net loss once you consider + * header and alignment padding. Worst case, the compressed format might + * require three padding bytes (plus header, which is included in + * VARSIZE(tmp)), whereas the uncompressed format would take only one + * header byte and no padding if the value is short enough. So we insist + * on a savings of more than 2 bytes to ensure we have a gain. + */ + if (VARSIZE(tmp) < valsize - 2) + { + /* successful compression */ + Assert(cmid != TOAST_INVALID_COMPRESSION_ID); + TOAST_COMPRESS_SET_SIZE_AND_COMPRESS_METHOD(tmp, valsize, cmid); + return PointerGetDatum(tmp); + } + else + { + /* incompressible data */ + pfree(tmp); + return PointerGetDatum(NULL); + } +} + +/* ---------- + * toast_save_datum - + * + * Save one single datum into the secondary relation and return + * a Datum reference for it. + * + * rel: the main relation we're working with (not the toast rel!) + * value: datum to be pushed to toast storage + * oldexternal: if not NULL, toast pointer previously representing the datum + * options: options to be passed to heap_insert() for toast rows + * ---------- + */ +Datum +toast_save_datum(Relation rel, Datum value, + struct varlena *oldexternal, int options) +{ + Relation toastrel; + Relation *toastidxs; + HeapTuple toasttup; + TupleDesc toasttupDesc; + Datum t_values[3]; + bool t_isnull[3]; + CommandId mycid = GetCurrentCommandId(true); + struct varlena *result; + struct varatt_external toast_pointer; + union + { + struct varlena hdr; + /* this is to make the union big enough for a chunk: */ + char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } chunk_data; + int32 chunk_size; + int32 chunk_seq = 0; + char *data_p; + int32 data_todo; + Pointer dval = DatumGetPointer(value); + int num_indexes; + int validIndex; + + Assert(!VARATT_IS_EXTERNAL(value)); + + /* + * Open the toast relation and its indexes. We can use the index to check + * uniqueness of the OID we assign to the toasted item, even though it has + * additional columns besides OID. + */ + toastrel = table_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); + toasttupDesc = toastrel->rd_att; + + /* Open all the toast indexes and look for the valid one */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Get the data pointer and length, and compute va_rawsize and va_extinfo. + * + * va_rawsize is the size of the equivalent fully uncompressed datum, so + * we have to adjust for short headers. + * + * va_extinfo stored the actual size of the data payload in the toast + * records and the compression method in first 2 bits if data is + * compressed. + */ + if (VARATT_IS_SHORT(dval)) + { + data_p = VARDATA_SHORT(dval); + data_todo = VARSIZE_SHORT(dval) - VARHDRSZ_SHORT; + toast_pointer.va_rawsize = data_todo + VARHDRSZ; /* as if not short */ + toast_pointer.va_extinfo = data_todo; + } + else if (VARATT_IS_COMPRESSED(dval)) + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + /* rawsize in a compressed datum is just the size of the payload */ + toast_pointer.va_rawsize = VARDATA_COMPRESSED_GET_EXTSIZE(dval) + VARHDRSZ; + + /* set external size and compression method */ + VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, data_todo, + VARDATA_COMPRESSED_GET_COMPRESS_METHOD(dval)); + /* Assert that the numbers look like it's compressed */ + Assert(VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)); + } + else + { + data_p = VARDATA(dval); + data_todo = VARSIZE(dval) - VARHDRSZ; + toast_pointer.va_rawsize = VARSIZE(dval); + toast_pointer.va_extinfo = data_todo; + } + + /* + * Insert the correct table OID into the result TOAST pointer. + * + * Normally this is the actual OID of the target toast table, but during + * table-rewriting operations such as CLUSTER, we have to insert the OID + * of the table's real permanent toast table instead. rd_toastoid is set + * if we have to substitute such an OID. + */ + if (OidIsValid(rel->rd_toastoid)) + toast_pointer.va_toastrelid = rel->rd_toastoid; + else + toast_pointer.va_toastrelid = RelationGetRelid(toastrel); + + /* + * Choose an OID to use as the value ID for this toast value. + * + * Normally we just choose an unused OID within the toast table. But + * during table-rewriting operations where we are preserving an existing + * toast table OID, we want to preserve toast value OIDs too. So, if + * rd_toastoid is set and we had a prior external value from that same + * toast table, re-use its value ID. If we didn't have a prior external + * value (which is a corner case, but possible if the table's attstorage + * options have been changed), we have to pick a value ID that doesn't + * conflict with either new or existing toast value OIDs. + */ + if (!OidIsValid(rel->rd_toastoid)) + { + /* normal case: just choose an unused OID */ + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } + else + { + /* rewrite case: check to see if value was in old toast table */ + toast_pointer.va_valueid = InvalidOid; + if (oldexternal != NULL) + { + struct varatt_external old_toast_pointer; + + Assert(VARATT_IS_EXTERNAL_ONDISK(oldexternal)); + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(old_toast_pointer, oldexternal); + if (old_toast_pointer.va_toastrelid == rel->rd_toastoid) + { + /* This value came from the old toast table; reuse its OID */ + toast_pointer.va_valueid = old_toast_pointer.va_valueid; + + /* + * There is a corner case here: the table rewrite might have + * to copy both live and recently-dead versions of a row, and + * those versions could easily reference the same toast value. + * When we copy the second or later version of such a row, + * reusing the OID will mean we select an OID that's already + * in the new toast table. Check for that, and if so, just + * fall through without writing the data again. + * + * While annoying and ugly-looking, this is a good thing + * because it ensures that we wind up with only one copy of + * the toast value when there is only one copy in the old + * toast table. Before we detected this case, we'd have made + * multiple copies, wasting space; and what's worse, the + * copies belonging to already-deleted heap tuples would not + * be reclaimed by VACUUM. + */ + if (toastrel_valueid_exists(toastrel, + toast_pointer.va_valueid)) + { + /* Match, so short-circuit the data storage loop below */ + data_todo = 0; + } + } + } + if (toast_pointer.va_valueid == InvalidOid) + { + /* + * new value; must choose an OID that doesn't conflict in either + * old or new toast table + */ + do + { + toast_pointer.va_valueid = + GetNewOidWithIndex(toastrel, + RelationGetRelid(toastidxs[validIndex]), + (AttrNumber) 1); + } while (toastid_valueid_exists(rel->rd_toastoid, + toast_pointer.va_valueid)); + } + } + + /* + * Initialize constant parts of the tuple data + */ + t_values[0] = ObjectIdGetDatum(toast_pointer.va_valueid); + t_values[2] = PointerGetDatum(&chunk_data); + t_isnull[0] = false; + t_isnull[1] = false; + t_isnull[2] = false; + + /* + * Split up the item into chunks + */ + while (data_todo > 0) + { + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Calculate the size of this chunk + */ + chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); + + /* + * Build a tuple and store it + */ + t_values[1] = Int32GetDatum(chunk_seq++); + SET_VARSIZE(&chunk_data, chunk_size + VARHDRSZ); + memcpy(VARDATA(&chunk_data), data_p, chunk_size); + toasttup = heap_form_tuple(toasttupDesc, t_values, t_isnull); + + heap_insert(toastrel, toasttup, mycid, options, NULL); + + /* + * Create the index entry. We cheat a little here by not using + * FormIndexDatum: this relies on the knowledge that the index columns + * are the same as the initial columns of the table for all the + * indexes. We also cheat by not providing an IndexInfo: this is okay + * for now because btree doesn't need one, but we might have to be + * more honest someday. + * + * Note also that there had better not be any user-created index on + * the TOAST table, since we don't bother to update anything else. + */ + for (i = 0; i < num_indexes; i++) + { + /* Only index relations marked as ready can be updated */ + if (toastidxs[i]->rd_index->indisready) + index_insert(toastidxs[i], t_values, t_isnull, + &(toasttup->t_self), + toastrel, + toastidxs[i]->rd_index->indisunique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, NULL); + } + + /* + * Free memory + */ + heap_freetuple(toasttup); + + /* + * Move on to next chunk + */ + data_todo -= chunk_size; + data_p += chunk_size; + } + + /* + * Done - close toast relation and its indexes but keep the lock until + * commit, so as a concurrent reindex done directly on the toast relation + * would be able to wait for this transaction. + */ + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); + + /* + * Create the TOAST pointer value that we'll return + */ + result = (struct varlena *) palloc(TOAST_POINTER_SIZE); + SET_VARTAG_EXTERNAL(result, VARTAG_ONDISK); + memcpy(VARDATA_EXTERNAL(result), &toast_pointer, sizeof(toast_pointer)); + + return PointerGetDatum(result); +} + +/* ---------- + * toast_delete_datum - + * + * Delete a single external stored value. + * ---------- + */ +void +toast_delete_datum(Relation rel, Datum value, bool is_speculative) +{ + struct varlena *attr = (struct varlena *) DatumGetPointer(value); + struct varatt_external toast_pointer; + Relation toastrel; + Relation *toastidxs; + ScanKeyData toastkey; + SysScanDesc toastscan; + HeapTuple toasttup; + int num_indexes; + int validIndex; + SnapshotData SnapshotToast; + + if (!VARATT_IS_EXTERNAL_ONDISK(attr)) + return; + + /* Must copy to access aligned fields */ + VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); + + /* + * Open the toast relation and its indexes + */ + toastrel = table_open(toast_pointer.va_toastrelid, RowExclusiveLock); + + /* Fetch valid relation used for process */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(toast_pointer.va_valueid)); + + /* + * Find all the chunks. (We don't actually care whether we see them in + * sequence or not, but since we've already locked the index we might as + * well use systable_beginscan_ordered.) + */ + init_toast_snapshot(&SnapshotToast); + toastscan = systable_beginscan_ordered(toastrel, toastidxs[validIndex], + &SnapshotToast, 1, &toastkey); + while ((toasttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) + { + /* + * Have a chunk, delete it + */ + if (is_speculative) + heap_abort_speculative(toastrel, &toasttup->t_self); + else + simple_heap_delete(toastrel, &toasttup->t_self); + } + + /* + * End scan and close relations but keep the lock until commit, so as a + * concurrent reindex done directly on the toast relation would be able to + * wait for this transaction. + */ + systable_endscan_ordered(toastscan); + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); +} + +/* ---------- + * toastrel_valueid_exists - + * + * Test whether a toast value with the given ID exists in the toast relation. + * For safety, we consider a value to exist if there are either live or dead + * toast rows with that ID; see notes for GetNewOidWithIndex(). + * ---------- + */ +static bool +toastrel_valueid_exists(Relation toastrel, Oid valueid) +{ + bool result = false; + ScanKeyData toastkey; + SysScanDesc toastscan; + int num_indexes; + int validIndex; + Relation *toastidxs; + + /* Fetch a valid index relation */ + validIndex = toast_open_indexes(toastrel, + RowExclusiveLock, + &toastidxs, + &num_indexes); + + /* + * Setup a scan key to find chunks with matching va_valueid + */ + ScanKeyInit(&toastkey, + (AttrNumber) 1, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(valueid)); + + /* + * Is there any such chunk? + */ + toastscan = systable_beginscan(toastrel, + RelationGetRelid(toastidxs[validIndex]), + true, SnapshotAny, 1, &toastkey); + + if (systable_getnext(toastscan) != NULL) + result = true; + + systable_endscan(toastscan); + + /* Clean up */ + toast_close_indexes(toastidxs, num_indexes, RowExclusiveLock); + + return result; +} + +/* ---------- + * toastid_valueid_exists - + * + * As above, but work from toast rel's OID not an open relation + * ---------- + */ +static bool +toastid_valueid_exists(Oid toastrelid, Oid valueid) +{ + bool result; + Relation toastrel; + + toastrel = table_open(toastrelid, AccessShareLock); + + result = toastrel_valueid_exists(toastrel, valueid); + + table_close(toastrel, AccessShareLock); + + return result; +} + +/* ---------- + * toast_get_valid_index + * + * Get OID of valid index associated to given toast relation. A toast + * relation can have only one valid index at the same time. + */ +Oid +toast_get_valid_index(Oid toastoid, LOCKMODE lock) +{ + int num_indexes; + int validIndex; + Oid validIndexOid; + Relation *toastidxs; + Relation toastrel; + + /* Open the toast relation */ + toastrel = table_open(toastoid, lock); + + /* Look for the valid index of the toast relation */ + validIndex = toast_open_indexes(toastrel, + lock, + &toastidxs, + &num_indexes); + validIndexOid = RelationGetRelid(toastidxs[validIndex]); + + /* Close the toast relation and all its indexes */ + toast_close_indexes(toastidxs, num_indexes, NoLock); + table_close(toastrel, NoLock); + + return validIndexOid; +} + +/* ---------- + * toast_open_indexes + * + * Get an array of the indexes associated to the given toast relation + * and return as well the position of the valid index used by the toast + * relation in this array. It is the responsibility of the caller of this + * function to close the indexes as well as free them. + */ +int +toast_open_indexes(Relation toastrel, + LOCKMODE lock, + Relation **toastidxs, + int *num_indexes) +{ + int i = 0; + int res = 0; + bool found = false; + List *indexlist; + ListCell *lc; + + /* Get index list of the toast relation */ + indexlist = RelationGetIndexList(toastrel); + Assert(indexlist != NIL); + + *num_indexes = list_length(indexlist); + + /* Open all the index relations */ + *toastidxs = (Relation *) palloc(*num_indexes * sizeof(Relation)); + foreach(lc, indexlist) + (*toastidxs)[i++] = index_open(lfirst_oid(lc), lock); + + /* Fetch the first valid index in list */ + for (i = 0; i < *num_indexes; i++) + { + Relation toastidx = (*toastidxs)[i]; + + if (toastidx->rd_index->indisvalid) + { + res = i; + found = true; + break; + } + } + + /* + * Free index list, not necessary anymore as relations are opened and a + * valid index has been found. + */ + list_free(indexlist); + + /* + * The toast relation should have one valid index, so something is going + * wrong if there is nothing. + */ + if (!found) + elog(ERROR, "no valid index found for toast relation with Oid %u", + RelationGetRelid(toastrel)); + + return res; +} + +/* ---------- + * toast_close_indexes + * + * Close an array of indexes for a toast relation and free it. This should + * be called for a set of indexes opened previously with toast_open_indexes. + */ +void +toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock) +{ + int i; + + /* Close relations and clean up things */ + for (i = 0; i < num_indexes; i++) + index_close(toastidxs[i], lock); + pfree(toastidxs); +} + +/* ---------- + * init_toast_snapshot + * + * Initialize an appropriate TOAST snapshot. We must use an MVCC snapshot + * to initialize the TOAST snapshot; since we don't know which one to use, + * just use the oldest one. This is safe: at worst, we will get a "snapshot + * too old" error that might have been avoided otherwise. + */ +void +init_toast_snapshot(Snapshot toast_snapshot) +{ + Snapshot snapshot = GetOldestSnapshot(); + + /* + * GetOldestSnapshot returns NULL if the session has no active snapshots. + * We can get that if, for example, a procedure fetches a toasted value + * into a local variable, commits, and then tries to detoast the value. + * Such coding is unsafe, because once we commit there is nothing to + * prevent the toast data from being deleted. Detoasting *must* happen in + * the same transaction that originally fetched the toast pointer. Hence, + * rather than trying to band-aid over the problem, throw an error. (This + * is not very much protection, because in many scenarios the procedure + * would have already created a new transaction snapshot, preventing us + * from detecting the problem. But it's better than nothing, and for sure + * we shouldn't expend code on masking the problem more.) + */ + if (snapshot == NULL) + elog(ERROR, "cannot fetch toast data without an active snapshot"); + + /* + * Catalog snapshots can be returned by GetOldestSnapshot() even if not + * registered or active. That easily hides bugs around not having a + * snapshot set up - most of the time there is a valid catalog snapshot. + * So additionally insist that the current snapshot is registered or + * active. + */ + Assert(HaveRegisteredOrActiveSnapshot()); + + InitToastSnapshot(*toast_snapshot, snapshot->lsn, snapshot->whenTaken); +} |