Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations than individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficients
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* extended_stats.c
|
|
|
|
* POSTGRES extended statistics
|
|
|
|
*
|
|
|
|
* Generic code supporting statistic objects created via CREATE STATISTICS.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/backend/statistics/extended_stats.c
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include "access/genam.h"
|
|
|
|
#include "access/heapam.h"
|
|
|
|
#include "access/htup_details.h"
|
|
|
|
#include "catalog/indexing.h"
|
|
|
|
#include "catalog/pg_collation.h"
|
|
|
|
#include "catalog/pg_statistic_ext.h"
|
|
|
|
#include "nodes/relation.h"
|
|
|
|
#include "statistics/extended_stats_internal.h"
|
|
|
|
#include "statistics/statistics.h"
|
|
|
|
#include "utils/builtins.h"
|
|
|
|
#include "utils/fmgroids.h"
|
|
|
|
#include "utils/lsyscache.h"
|
|
|
|
#include "utils/rel.h"
|
|
|
|
#include "utils/syscache.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Used internally to refer to an individual pg_statistic_ext entry.
|
|
|
|
*/
|
|
|
|
typedef struct StatExtEntry
|
|
|
|
{
|
|
|
|
Oid statOid; /* OID of pg_statistic_ext entry */
|
|
|
|
Bitmapset *columns; /* attribute numbers covered by the statistics */
|
|
|
|
List *types; /* 'char' list of enabled statistic kinds */
|
|
|
|
} StatExtEntry;
|
|
|
|
|
|
|
|
|
|
|
|
static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
|
|
|
|
static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
|
|
|
|
int natts, VacAttrStats **vacattrstats);
|
|
|
|
static void statext_store(Relation pg_stext, Oid relid,
|
2017-04-05 18:00:42 -04:00
|
|
|
MVNDistinct *ndistinct, MVDependencies *dependencies,
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
VacAttrStats **stats);
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute requested extended stats, using the rows sampled for the plain
|
|
|
|
* (single-column) stats.
|
|
|
|
*
|
|
|
|
* This fetches a list of stats from pg_statistic_ext, computes the stats
|
|
|
|
* and serializes them back into the catalog (as bytea values).
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
BuildRelationExtStatistics(Relation onerel, double totalrows,
|
|
|
|
int numrows, HeapTuple *rows,
|
|
|
|
int natts, VacAttrStats **vacattrstats)
|
|
|
|
{
|
|
|
|
Relation pg_stext;
|
|
|
|
ListCell *lc;
|
|
|
|
List *stats;
|
|
|
|
|
|
|
|
pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock);
|
|
|
|
stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
|
|
|
|
|
|
|
|
foreach(lc, stats)
|
|
|
|
{
|
|
|
|
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
|
|
|
|
MVNDistinct *ndistinct = NULL;
|
2017-04-05 18:00:42 -04:00
|
|
|
MVDependencies *dependencies = NULL;
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
VacAttrStats **stats;
|
|
|
|
ListCell *lc2;
|
|
|
|
|
|
|
|
/* filter only the interesting vacattrstats records */
|
|
|
|
stats = lookup_var_attr_stats(onerel, stat->columns,
|
|
|
|
natts, vacattrstats);
|
|
|
|
|
|
|
|
/* check allowed number of dimensions */
|
|
|
|
Assert(bms_num_members(stat->columns) >= 2 &&
|
|
|
|
bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
|
|
|
|
|
|
|
|
/* compute statistic of each type */
|
|
|
|
foreach(lc2, stat->types)
|
|
|
|
{
|
|
|
|
char t = (char) lfirst_int(lc2);
|
|
|
|
|
|
|
|
if (t == STATS_EXT_NDISTINCT)
|
|
|
|
ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
|
|
|
|
stat->columns, stats);
|
2017-04-05 18:00:42 -04:00
|
|
|
else if (t == STATS_EXT_DEPENDENCIES)
|
|
|
|
dependencies = statext_dependencies_build(numrows, rows,
|
|
|
|
stat->columns, stats);
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* store the statistics in the catalog */
|
2017-04-05 18:00:42 -04:00
|
|
|
statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats);
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
heap_close(pg_stext, RowExclusiveLock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* statext_is_kind_built
|
|
|
|
* Is this stat kind built in the given pg_statistic_ext tuple?
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
statext_is_kind_built(HeapTuple htup, char type)
|
|
|
|
{
|
|
|
|
AttrNumber attnum;
|
|
|
|
|
|
|
|
switch (type)
|
|
|
|
{
|
|
|
|
case STATS_EXT_NDISTINCT:
|
|
|
|
attnum = Anum_pg_statistic_ext_standistinct;
|
|
|
|
break;
|
|
|
|
|
2017-04-05 18:00:42 -04:00
|
|
|
case STATS_EXT_DEPENDENCIES:
|
|
|
|
attnum = Anum_pg_statistic_ext_stadependencies;
|
|
|
|
break;
|
|
|
|
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
default:
|
|
|
|
elog(ERROR, "unexpected statistics type requested: %d", type);
|
|
|
|
}
|
|
|
|
|
|
|
|
return !heap_attisnull(htup, attnum);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return a list (of StatExtEntry) of statistics for the given relation.
|
|
|
|
*/
|
|
|
|
static List *
|
|
|
|
fetch_statentries_for_relation(Relation pg_statext, Oid relid)
|
|
|
|
{
|
|
|
|
SysScanDesc scan;
|
|
|
|
ScanKeyData skey;
|
|
|
|
HeapTuple htup;
|
|
|
|
List *result = NIL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare to scan pg_statistic_ext for entries having indrelid = this
|
|
|
|
* rel.
|
|
|
|
*/
|
|
|
|
ScanKeyInit(&skey,
|
|
|
|
Anum_pg_statistic_ext_starelid,
|
|
|
|
BTEqualStrategyNumber, F_OIDEQ,
|
|
|
|
ObjectIdGetDatum(relid));
|
|
|
|
|
|
|
|
scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true,
|
|
|
|
NULL, 1, &skey);
|
|
|
|
|
|
|
|
while (HeapTupleIsValid(htup = systable_getnext(scan)))
|
|
|
|
{
|
|
|
|
StatExtEntry *entry;
|
|
|
|
Datum datum;
|
|
|
|
bool isnull;
|
|
|
|
int i;
|
|
|
|
ArrayType *arr;
|
|
|
|
char *enabled;
|
|
|
|
Form_pg_statistic_ext staForm;
|
|
|
|
|
|
|
|
entry = palloc0(sizeof(StatExtEntry));
|
|
|
|
entry->statOid = HeapTupleGetOid(htup);
|
|
|
|
staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
|
|
|
|
for (i = 0; i < staForm->stakeys.dim1; i++)
|
|
|
|
{
|
|
|
|
entry->columns = bms_add_member(entry->columns,
|
|
|
|
staForm->stakeys.values[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* decode the staenabled char array into a list of chars */
|
|
|
|
datum = SysCacheGetAttr(STATEXTOID, htup,
|
|
|
|
Anum_pg_statistic_ext_staenabled, &isnull);
|
|
|
|
Assert(!isnull);
|
|
|
|
arr = DatumGetArrayTypeP(datum);
|
|
|
|
if (ARR_NDIM(arr) != 1 ||
|
|
|
|
ARR_HASNULL(arr) ||
|
|
|
|
ARR_ELEMTYPE(arr) != CHAROID)
|
|
|
|
elog(ERROR, "staenabled is not a 1-D char array");
|
|
|
|
enabled = (char *) ARR_DATA_PTR(arr);
|
|
|
|
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
|
|
|
|
{
|
2017-04-05 18:00:42 -04:00
|
|
|
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
|
|
|
|
(enabled[i] == STATS_EXT_DEPENDENCIES));
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
entry->types = lappend_int(entry->types, (int) enabled[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
result = lappend(result, entry);
|
|
|
|
}
|
|
|
|
|
|
|
|
systable_endscan(scan);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Using 'vacattrstats' of size 'natts' as input data, return a newly built
|
|
|
|
* VacAttrStats array which includes only the items corresponding to attributes
|
|
|
|
* indicated by 'attrs'.
|
|
|
|
*/
|
|
|
|
static VacAttrStats **
|
|
|
|
lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int natts,
|
|
|
|
VacAttrStats **vacattrstats)
|
|
|
|
{
|
|
|
|
int i = 0;
|
|
|
|
int x = -1;
|
|
|
|
VacAttrStats **stats;
|
|
|
|
Bitmapset *matched = NULL;
|
|
|
|
|
|
|
|
stats = (VacAttrStats **)
|
|
|
|
palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
|
|
|
|
|
|
|
|
/* lookup VacAttrStats info for the requested columns (same attnum) */
|
|
|
|
while ((x = bms_next_member(attrs, x)) >= 0)
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
|
|
|
|
stats[i] = NULL;
|
|
|
|
for (j = 0; j < natts; j++)
|
|
|
|
{
|
|
|
|
if (x == vacattrstats[j]->tupattnum)
|
|
|
|
{
|
|
|
|
stats[i] = vacattrstats[j];
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!stats[i])
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
|
|
|
|
errmsg("extended statistics could not be collected for column \"%s\" of relation %s.%s",
|
|
|
|
NameStr(RelationGetDescr(rel)->attrs[x - 1]->attname),
|
|
|
|
get_namespace_name(rel->rd_rel->relnamespace),
|
|
|
|
RelationGetRelationName(rel)),
|
|
|
|
errhint("Consider ALTER TABLE \"%s\".\"%s\" ALTER \"%s\" SET STATISTICS -1",
|
|
|
|
get_namespace_name(rel->rd_rel->relnamespace),
|
|
|
|
RelationGetRelationName(rel),
|
|
|
|
NameStr(RelationGetDescr(rel)->attrs[x - 1]->attname))));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check that we found a non-dropped column and that the attnum
|
|
|
|
* matches.
|
|
|
|
*/
|
|
|
|
Assert(!stats[i]->attr->attisdropped);
|
|
|
|
matched = bms_add_member(matched, stats[i]->tupattnum);
|
|
|
|
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
if (bms_subset_compare(matched, attrs) != BMS_EQUAL)
|
|
|
|
elog(ERROR, "could not find all attributes in attribute stats array");
|
|
|
|
bms_free(matched);
|
|
|
|
|
|
|
|
return stats;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* statext_store
|
|
|
|
* Serializes the statistics and stores them into the pg_statistic_ext tuple.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
statext_store(Relation pg_stext, Oid statOid,
|
2017-04-05 18:00:42 -04:00
|
|
|
MVNDistinct *ndistinct, MVDependencies *dependencies,
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
VacAttrStats **stats)
|
|
|
|
{
|
|
|
|
HeapTuple stup,
|
|
|
|
oldtup;
|
|
|
|
Datum values[Natts_pg_statistic_ext];
|
|
|
|
bool nulls[Natts_pg_statistic_ext];
|
|
|
|
bool replaces[Natts_pg_statistic_ext];
|
|
|
|
|
|
|
|
memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool));
|
|
|
|
memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool));
|
|
|
|
memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Construct a new pg_statistic_ext tuple, replacing the calculated stats.
|
|
|
|
*/
|
|
|
|
if (ndistinct != NULL)
|
|
|
|
{
|
|
|
|
bytea *data = statext_ndistinct_serialize(ndistinct);
|
|
|
|
|
|
|
|
nulls[Anum_pg_statistic_ext_standistinct - 1] = (data == NULL);
|
|
|
|
values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data);
|
|
|
|
}
|
|
|
|
|
2017-04-05 18:00:42 -04:00
|
|
|
if (dependencies != NULL)
|
|
|
|
{
|
|
|
|
bytea *data = statext_dependencies_serialize(dependencies);
|
|
|
|
|
|
|
|
nulls[Anum_pg_statistic_ext_stadependencies - 1] = (data == NULL);
|
|
|
|
values[Anum_pg_statistic_ext_stadependencies - 1] = PointerGetDatum(data);
|
|
|
|
}
|
|
|
|
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
/* always replace the value (either by bytea or NULL) */
|
|
|
|
replaces[Anum_pg_statistic_ext_standistinct - 1] = true;
|
2017-04-05 18:00:42 -04:00
|
|
|
replaces[Anum_pg_statistic_ext_stadependencies - 1] = true;
|
Implement multivariate n-distinct coefficients
Add support for explicitly declared statistic objects (CREATE
STATISTICS), allowing collection of statistics on more complex
combinations that individual table columns. Companion commands DROP
STATISTICS and ALTER STATISTICS ... OWNER TO / SET SCHEMA / RENAME are
added too. All this DDL has been designed so that more statistic types
can be added later on, such as multivariate most-common-values and
multivariate histograms between columns of a single table, leaving room
for permitting columns on multiple tables, too, as well as expressions.
This commit only adds support for collection of n-distinct coefficient
on user-specified sets of columns in a single table. This is useful to
estimate number of distinct groups in GROUP BY and DISTINCT clauses;
estimation errors there can cause over-allocation of memory in hashed
aggregates, for instance, so it's a worthwhile problem to solve. A new
special pseudo-type pg_ndistinct is used.
(num-distinct estimation was deemed sufficiently useful by itself that
this is worthwhile even if no further statistic types are added
immediately; so much so that another version of essentially the same
functionality was submitted by Kyotaro Horiguchi:
https://postgr.es/m/20150828.173334.114731693.horiguchi.kyotaro@lab.ntt.co.jp
though this commit does not use that code.)
Author: Tomas Vondra. Some code rework by Álvaro.
Reviewed-by: Dean Rasheed, David Rowley, Kyotaro Horiguchi, Jeff Janes,
Ideriha Takeshi
Discussion: https://postgr.es/m/543AFA15.4080608@fuzzy.cz
https://postgr.es/m/20170320190220.ixlaueanxegqd5gr@alvherre.pgsql
2017-03-24 14:06:10 -03:00
|
|
|
|
|
|
|
/* there should already be a pg_statistic_ext tuple */
|
|
|
|
oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
|
|
|
|
if (!HeapTupleIsValid(oldtup))
|
|
|
|
elog(ERROR, "cache lookup failed for extended statistics %u", statOid);
|
|
|
|
|
|
|
|
/* replace it */
|
|
|
|
stup = heap_modify_tuple(oldtup,
|
|
|
|
RelationGetDescr(pg_stext),
|
|
|
|
values,
|
|
|
|
nulls,
|
|
|
|
replaces);
|
|
|
|
ReleaseSysCache(oldtup);
|
|
|
|
CatalogTupleUpdate(pg_stext, &stup->t_self, stup);
|
|
|
|
|
|
|
|
heap_freetuple(stup);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* initialize multi-dimensional sort */
|
|
|
|
MultiSortSupport
|
|
|
|
multi_sort_init(int ndims)
|
|
|
|
{
|
|
|
|
MultiSortSupport mss;
|
|
|
|
|
|
|
|
Assert(ndims >= 2);
|
|
|
|
|
|
|
|
mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
|
|
|
|
+sizeof(SortSupportData) * ndims);
|
|
|
|
|
|
|
|
mss->ndims = ndims;
|
|
|
|
|
|
|
|
return mss;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prepare sort support info using the given sort operator
|
|
|
|
* at the position 'sortdim'
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
multi_sort_add_dimension(MultiSortSupport mss, int sortdim, Oid oper)
|
|
|
|
{
|
|
|
|
SortSupport ssup = &mss->ssup[sortdim];
|
|
|
|
|
|
|
|
ssup->ssup_cxt = CurrentMemoryContext;
|
|
|
|
ssup->ssup_collation = DEFAULT_COLLATION_OID;
|
|
|
|
ssup->ssup_nulls_first = false;
|
|
|
|
ssup->ssup_cxt = CurrentMemoryContext;
|
|
|
|
|
|
|
|
PrepareSortSupportFromOrderingOp(oper, ssup);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* compare all the dimensions in the selected order */
|
|
|
|
int
|
|
|
|
multi_sort_compare(const void *a, const void *b, void *arg)
|
|
|
|
{
|
|
|
|
MultiSortSupport mss = (MultiSortSupport) arg;
|
|
|
|
SortItem *ia = (SortItem *) a;
|
|
|
|
SortItem *ib = (SortItem *) b;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < mss->ndims; i++)
|
|
|
|
{
|
|
|
|
int compare;
|
|
|
|
|
|
|
|
compare = ApplySortComparator(ia->values[i], ia->isnull[i],
|
|
|
|
ib->values[i], ib->isnull[i],
|
|
|
|
&mss->ssup[i]);
|
|
|
|
|
|
|
|
if (compare != 0)
|
|
|
|
return compare;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* equal by default */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* compare selected dimension */
|
|
|
|
int
|
|
|
|
multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
|
|
|
|
MultiSortSupport mss)
|
|
|
|
{
|
|
|
|
return ApplySortComparator(a->values[dim], a->isnull[dim],
|
|
|
|
b->values[dim], b->isnull[dim],
|
|
|
|
&mss->ssup[dim]);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
multi_sort_compare_dims(int start, int end,
|
|
|
|
const SortItem *a, const SortItem *b,
|
|
|
|
MultiSortSupport mss)
|
|
|
|
{
|
|
|
|
int dim;
|
|
|
|
|
|
|
|
for (dim = start; dim <= end; dim++)
|
|
|
|
{
|
|
|
|
int r = ApplySortComparator(a->values[dim], a->isnull[dim],
|
|
|
|
b->values[dim], b->isnull[dim],
|
|
|
|
&mss->ssup[dim]);
|
|
|
|
|
|
|
|
if (r != 0)
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2017-04-05 18:00:42 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* has_stats_of_kind
|
2017-04-06 12:27:15 -03:00
|
|
|
* Check that the list contains statistic of a given kind
|
2017-04-05 18:00:42 -04:00
|
|
|
*/
|
|
|
|
bool
|
|
|
|
has_stats_of_kind(List *stats, char requiredkind)
|
|
|
|
{
|
|
|
|
ListCell *l;
|
|
|
|
|
|
|
|
foreach(l, stats)
|
|
|
|
{
|
|
|
|
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
|
|
|
|
|
|
|
|
if (stat->kind == requiredkind)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* choose_best_statistics
|
2017-04-06 12:27:15 -03:00
|
|
|
* Look for and return statistics with the specified 'requiredkind' which
|
|
|
|
* have keys that match at least two of the given attnums. Return NULL if
|
|
|
|
* there's no match.
|
2017-04-05 18:00:42 -04:00
|
|
|
*
|
|
|
|
* The current selection criteria is very simple - we choose the statistics
|
|
|
|
* referencing the most attributes with the least keys.
|
|
|
|
*
|
|
|
|
* XXX if multiple statistics exists of the same size matching the same number
|
|
|
|
* of keys, then the statistics which are chosen depend on the order that they
|
|
|
|
* appear in the stats list. Perhaps this needs to be more definitive.
|
|
|
|
*/
|
|
|
|
StatisticExtInfo *
|
|
|
|
choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
|
|
|
|
{
|
|
|
|
ListCell *lc;
|
|
|
|
StatisticExtInfo *best_match = NULL;
|
|
|
|
int best_num_matched = 2; /* goal #1: maximize */
|
|
|
|
int best_match_keys = (STATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */
|
|
|
|
|
|
|
|
foreach(lc, stats)
|
|
|
|
{
|
|
|
|
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
|
|
|
|
int num_matched;
|
|
|
|
int numkeys;
|
|
|
|
Bitmapset *matched;
|
|
|
|
|
|
|
|
/* skip statistics that are not the correct type */
|
|
|
|
if (info->kind != requiredkind)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* determine how many attributes of these stats can be matched to */
|
|
|
|
matched = bms_intersect(attnums, info->keys);
|
|
|
|
num_matched = bms_num_members(matched);
|
|
|
|
bms_free(matched);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* save the actual number of keys in the stats so that we can choose
|
|
|
|
* the narrowest stats with the most matching keys.
|
|
|
|
*/
|
|
|
|
numkeys = bms_num_members(info->keys);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use these statistics when it increases the number of matched
|
|
|
|
* clauses or when it matches the same number of attributes but these
|
|
|
|
* stats have fewer keys than any previous match.
|
|
|
|
*/
|
|
|
|
if (num_matched > best_num_matched ||
|
|
|
|
(num_matched == best_num_matched && numkeys < best_match_keys))
|
|
|
|
{
|
|
|
|
best_match = info;
|
|
|
|
best_num_matched = num_matched;
|
|
|
|
best_match_keys = numkeys;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return best_match;
|
|
|
|
}
|