postgres/contrib/pg_upgrade/pg_upgrade.c

829 lines
24 KiB
C
Raw Permalink Normal View History

/*
* pg_upgrade.c
*
* main source file
*
* Copyright (c) 2010-2014, PostgreSQL Global Development Group
2010-09-20 22:08:53 +02:00
* contrib/pg_upgrade/pg_upgrade.c
*/
/*
* To simplify the upgrade process, we force certain system values to be
* identical between old and new clusters:
*
* We control all assignments of pg_class.oid (and relfilenode) so toast
* oids are the same between old and new clusters. This is important
* because toast oids are stored as toast pointers in user tables.
*
* While pg_class.oid and pg_class.relfilenode are initially the same
* in a cluster, they can diverge due to CLUSTER, REINDEX, or VACUUM
* FULL. In the new cluster, pg_class.oid and pg_class.relfilenode will
* be the same and will match the old pg_class.oid value. Because of
* this, old/new pg_class.relfilenode values will not match if CLUSTER,
* REINDEX, or VACUUM FULL have been performed in the old cluster.
*
* We control all assignments of pg_type.oid because these oids are stored
* in user composite type values.
*
* We control all assignments of pg_enum.oid because these oids are stored
* in user tables as enum values.
*
* We control all assignments of pg_authid.oid because these oids are stored
* in pg_largeobject_metadata.
*/
#include "postgres_fe.h"
2011-04-10 11:42:00 -04:00
#include "pg_upgrade.h"
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
static void prepare_new_cluster(void);
static void prepare_new_databases(void);
static void create_new_objects(void);
static void copy_clog_xlog_xid(void);
static void set_frozenxids(bool minmxid_only);
static void setup(char *argv0, bool *live_check);
static void cleanup(void);
static void get_restricted_token(const char *progname);
#ifdef WIN32
static int CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, const char *progname);
#endif
2011-04-10 11:42:00 -04:00
ClusterInfo old_cluster,
new_cluster;
OSInfo os_info;
char *output_files[] = {
SERVER_LOG_FILE,
#ifdef WIN32
/* unique file for pg_ctl start */
SERVER_START_LOG_FILE,
#endif
UTILITY_LOG_FILE,
INTERNAL_LOG_FILE,
NULL
};
#ifdef WIN32
static char *restrict_env;
#endif
int
main(int argc, char **argv)
{
char *sequence_script_file_name = NULL;
char *analyze_script_file_name = NULL;
char *deletion_script_file_name = NULL;
bool live_check = false;
/* Ensure that all files created by pg_upgrade are non-world-readable */
umask(S_IRWXG | S_IRWXO);
parseCommandLine(argc, argv);
get_restricted_token(os_info.progname);
adjust_data_dir(&old_cluster);
adjust_data_dir(&new_cluster);
setup(argv[0], &live_check);
output_check_banner(live_check);
check_cluster_versions();
get_sock_dir(&old_cluster, live_check);
get_sock_dir(&new_cluster, false);
check_cluster_compatibility(live_check);
check_and_dump_old_cluster(live_check, &sequence_script_file_name);
/* -- NEW -- */
start_postmaster(&new_cluster, true);
check_new_cluster();
report_clusters_compatible();
pg_log(PG_REPORT, "\nPerforming Upgrade\n");
pg_log(PG_REPORT, "------------------\n");
prepare_new_cluster();
stop_postmaster(false);
/*
* Destructive Changes to New Cluster
*/
copy_clog_xlog_xid();
/* New now using xids of the old system */
/* -- NEW -- */
start_postmaster(&new_cluster, true);
prepare_new_databases();
create_new_objects();
stop_postmaster(false);
/*
* Most failures happen in create_new_objects(), which has completed at
* this point. We do this here because it is just before linking, which
* will link the old and new cluster data files, preventing the old
* cluster from being safely started once the new cluster is started.
*/
if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
disable_old_cluster();
transfer_all_new_tablespaces(&old_cluster.dbarr, &new_cluster.dbarr,
old_cluster.pgdata, new_cluster.pgdata);
/*
* Assuming OIDs are only used in system tables, there is no need to
* restore the OID counter because we have not transferred any OIDs from
* the old system, but we do it anyway just in case. We do it late here
* because there is no need to have the schema load use new oids.
*/
prep_status("Setting next OID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/pg_resetxlog\" -o %u \"%s\"",
new_cluster.bindir, old_cluster.controldata.chkpnt_nxtoid,
new_cluster.pgdata);
check_ok();
prep_status("Sync data directory to disk");
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/initdb\" --sync-only \"%s\"", new_cluster.bindir,
new_cluster.pgdata);
check_ok();
create_script_for_cluster_analyze(&analyze_script_file_name);
create_script_for_old_cluster_deletion(&deletion_script_file_name);
issue_warnings_and_set_wal_level(sequence_script_file_name);
pg_log(PG_REPORT, "\nUpgrade Complete\n");
pg_log(PG_REPORT, "----------------\n");
output_completion_banner(analyze_script_file_name,
deletion_script_file_name);
pg_free(analyze_script_file_name);
pg_free(deletion_script_file_name);
pg_free(sequence_script_file_name);
cleanup();
return 0;
}
#ifdef WIN32
typedef BOOL(WINAPI * __CreateRestrictedToken) (HANDLE, DWORD, DWORD, PSID_AND_ATTRIBUTES, DWORD, PLUID_AND_ATTRIBUTES, DWORD, PSID_AND_ATTRIBUTES, PHANDLE);
/* Windows API define missing from some versions of MingW headers */
#ifndef DISABLE_MAX_PRIVILEGE
#define DISABLE_MAX_PRIVILEGE 0x1
#endif
/*
* Create a restricted token and execute the specified process with it.
*
* Returns 0 on failure, non-zero on success, same as CreateProcess().
*
* On NT4, or any other system not containing the required functions, will
* NOT execute anything.
*/
static int
CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, const char *progname)
{
BOOL b;
STARTUPINFO si;
HANDLE origToken;
HANDLE restrictedToken;
SID_IDENTIFIER_AUTHORITY NtAuthority = { SECURITY_NT_AUTHORITY };
SID_AND_ATTRIBUTES dropSids[2];
__CreateRestrictedToken _CreateRestrictedToken = NULL;
HANDLE Advapi32Handle;
ZeroMemory(&si, sizeof(si));
si.cb = sizeof(si);
Advapi32Handle = LoadLibrary("ADVAPI32.DLL");
if (Advapi32Handle != NULL)
{
_CreateRestrictedToken = (__CreateRestrictedToken)GetProcAddress(Advapi32Handle, "CreateRestrictedToken");
}
if (_CreateRestrictedToken == NULL)
{
fprintf(stderr, _("%s: WARNING: cannot create restricted tokens on this platform\n"), progname);
if (Advapi32Handle != NULL)
FreeLibrary(Advapi32Handle);
return 0;
}
/* Open the current token to use as a base for the restricted one */
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &origToken))
{
fprintf(stderr, _("%s: could not open process token: error code %lu\n"), progname, GetLastError());
return 0;
}
/* Allocate list of SIDs to remove */
ZeroMemory(&dropSids, sizeof(dropSids));
if (!AllocateAndInitializeSid(&NtAuthority, 2,
SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_ADMINS, 0, 0, 0, 0, 0,
0, &dropSids[0].Sid) ||
!AllocateAndInitializeSid(&NtAuthority, 2,
SECURITY_BUILTIN_DOMAIN_RID, DOMAIN_ALIAS_RID_POWER_USERS, 0, 0, 0, 0, 0,
0, &dropSids[1].Sid))
{
2015-05-17 22:21:36 -04:00
fprintf(stderr, _("%s: could not allocate SIDs: error code %lu\n"), progname, GetLastError());
return 0;
}
b = _CreateRestrictedToken(origToken,
DISABLE_MAX_PRIVILEGE,
sizeof(dropSids) / sizeof(dropSids[0]),
dropSids,
0, NULL,
0, NULL,
&restrictedToken);
FreeSid(dropSids[1].Sid);
FreeSid(dropSids[0].Sid);
CloseHandle(origToken);
FreeLibrary(Advapi32Handle);
if (!b)
{
fprintf(stderr, _("%s: could not create restricted token: error code %lu\n"), progname, GetLastError());
return 0;
}
#ifndef __CYGWIN__
AddUserToTokenDacl(restrictedToken);
#endif
if (!CreateProcessAsUser(restrictedToken,
NULL,
cmd,
NULL,
NULL,
TRUE,
CREATE_SUSPENDED,
NULL,
NULL,
&si,
processInfo))
{
fprintf(stderr, _("%s: could not start process for command \"%s\": error code %lu\n"), progname, cmd, GetLastError());
return 0;
}
return ResumeThread(processInfo->hThread);
}
#endif
static void
get_restricted_token(const char *progname)
{
#ifdef WIN32
/*
* Before we execute another program, make sure that we are running with a
* restricted token. If not, re-execute ourselves with one.
*/
if ((restrict_env = getenv("PG_RESTRICT_EXEC")) == NULL
|| strcmp(restrict_env, "1") != 0)
{
PROCESS_INFORMATION pi;
char *cmdline;
ZeroMemory(&pi, sizeof(pi));
cmdline = pg_strdup(GetCommandLine());
putenv("PG_RESTRICT_EXEC=1");
if (!CreateRestrictedProcess(cmdline, &pi, progname))
{
fprintf(stderr, _("%s: could not re-execute with restricted token: error code %lu\n"), progname, GetLastError());
}
else
{
/*
* Successfully re-execed. Now wait for child process to capture
* exitcode.
*/
DWORD x;
CloseHandle(pi.hThread);
WaitForSingleObject(pi.hProcess, INFINITE);
if (!GetExitCodeProcess(pi.hProcess, &x))
{
fprintf(stderr, _("%s: could not get exit code from subprocess: error code %lu\n"), progname, GetLastError());
exit(1);
}
exit(x);
}
}
#endif
}
static void
setup(char *argv0, bool *live_check)
{
char exec_path[MAXPGPATH]; /* full path to my executable */
/*
* make sure the user has a clean environment, otherwise, we may confuse
* libpq when we connect to one (or both) of the servers.
*/
check_pghost_envvar();
verify_directories();
/* no postmasters should be running, except for a live check */
if (pid_lock_file_exists(old_cluster.pgdata))
{
/*
* If we have a postmaster.pid file, try to start the server. If it
* starts, the pid file was stale, so stop the server. If it doesn't
* start, assume the server is running. If the pid file is left over
* from a server crash, this also allows any committed transactions
* stored in the WAL to be replayed so they are not lost, because WAL
* files are not transfered from old to new servers. We later check
* for a clean shutdown.
*/
if (start_postmaster(&old_cluster, false))
stop_postmaster(false);
else
{
if (!user_opts.check)
pg_fatal("There seems to be a postmaster servicing the old cluster.\n"
"Please shutdown that postmaster and try again.\n");
else
*live_check = true;
}
}
/* same goes for the new postmaster */
if (pid_lock_file_exists(new_cluster.pgdata))
{
if (start_postmaster(&new_cluster, false))
stop_postmaster(false);
else
pg_fatal("There seems to be a postmaster servicing the new cluster.\n"
"Please shutdown that postmaster and try again.\n");
}
/* get path to pg_upgrade executable */
if (find_my_exec(argv0, exec_path) < 0)
pg_fatal("Could not get path name to pg_upgrade: %s\n", getErrorText());
/* Trim off program name and keep just path */
*last_dir_separator(exec_path) = '\0';
canonicalize_path(exec_path);
os_info.exec_path = pg_strdup(exec_path);
}
static void
prepare_new_cluster(void)
{
/*
* It would make more sense to freeze after loading the schema, but that
* would cause us to lose the frozenids restored by the load. We use
* --analyze so autovacuum doesn't update statistics later
*/
prep_status("Analyzing all rows in the new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/vacuumdb\" %s --all --analyze %s",
new_cluster.bindir, cluster_conn_opts(&new_cluster),
log_opts.verbose ? "--verbose" : "");
check_ok();
/*
2010-07-06 19:19:02 +00:00
* We do freeze after analyze so pg_statistic is also frozen. template0 is
* not frozen here, but data rows were frozen by initdb, and we set its
* datfrozenxid, relfrozenxids, and relminmxid later to match the new xid
* counter later.
*/
prep_status("Freezing all rows on the new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/vacuumdb\" %s --all --freeze %s",
new_cluster.bindir, cluster_conn_opts(&new_cluster),
log_opts.verbose ? "--verbose" : "");
check_ok();
get_pg_database_relfilenode(&new_cluster);
}
static void
prepare_new_databases(void)
{
/*
* Before we restore anything, set frozenxids of initdb-created tables.
*/
set_frozenxids(false);
/*
* Now restore global objects (roles and tablespaces).
*/
prep_status("Restoring global objects in the new cluster");
/*
* Install support functions in the global-object restore database to
* preserve pg_authid.oid. pg_dumpall uses 'template0' as its template
* database so objects we add into 'template1' are not propogated. They
* are removed on pg_upgrade exit.
*/
install_support_functions_in_new_db("template1");
/*
* We have to create the databases first so we can install support
2011-04-10 11:42:00 -04:00
* functions in all the other databases. Ideally we could create the
* support functions in template1 but pg_dumpall creates database using
* the template0 template.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/psql\" " EXEC_PSQL_ARGS " %s -f \"%s\"",
new_cluster.bindir, cluster_conn_opts(&new_cluster),
GLOBALS_DUMP_FILE);
check_ok();
/* we load this to get a current list of databases */
get_db_and_rel_infos(&new_cluster);
}
static void
create_new_objects(void)
{
int dbnum;
prep_status("Adding support functions to new cluster");
/*
* Technically, we only need to install these support functions in new
* databases that also exist in the old cluster, but for completeness we
* process all new databases.
*/
for (dbnum = 0; dbnum < new_cluster.dbarr.ndbs; dbnum++)
{
DbInfo *new_db = &new_cluster.dbarr.dbs[dbnum];
/* skip db we already installed */
if (strcmp(new_db->db_name, "template1") != 0)
install_support_functions_in_new_db(new_db->db_name);
}
check_ok();
prep_status("Restoring database schemas in the new cluster\n");
for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
{
char sql_file_name[MAXPGPATH],
log_file_name[MAXPGPATH];
DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum];
PQExpBufferData connstr,
escaped_connstr;
initPQExpBuffer(&connstr);
appendPQExpBuffer(&connstr, "dbname=");
appendConnStrVal(&connstr, old_db->db_name);
initPQExpBuffer(&escaped_connstr);
appendShellString(&escaped_connstr, connstr.data);
termPQExpBuffer(&connstr);
pg_log(PG_STATUS, "%s", old_db->db_name);
snprintf(sql_file_name, sizeof(sql_file_name), DB_DUMP_FILE_MASK, old_db->db_oid);
snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid);
/*
* pg_dump only produces its output at the end, so there is little
* parallelism if using the pipe.
*/
parallel_exec_prog(log_file_name,
NULL,
"\"%s/pg_restore\" %s --exit-on-error --verbose --dbname %s \"%s\"",
new_cluster.bindir,
cluster_conn_opts(&new_cluster),
escaped_connstr.data,
sql_file_name);
termPQExpBuffer(&escaped_connstr);
}
/* reap all children */
while (reap_child(true) == true)
;
end_progress_output();
check_ok();
/*
* We don't have minmxids for databases or relations in pre-9.3
Fix pg_upgrade to not fail when new-cluster TOAST rules differ from old. This patch essentially reverts commit 4c6780fd17aa43ed, in favor of a much simpler solution for the case where the new cluster would choose to create a TOAST table but the old cluster doesn't have one: just don't create a TOAST table. The existing code failed in at least two different ways if the situation arose: (1) ALTER TABLE RESET didn't grab an exclusive lock, so that the lock sanity check in create_toast_table failed; (2) pg_upgrade did not provide a pg_type OID for the new toast table, so that the crosscheck in TypeCreate failed. While both these problems were introduced by later patches, they show that the hack being used to cause TOAST table creation is overwhelmingly fragile (and untested). I also note that before the TypeCreate crosscheck was added, the code would have resulted in assigning an indeterminate pg_type OID to the toast table, possibly causing a later OID conflict in that catalog; so that it didn't really work even when committed. If we simply don't create a TOAST table, there will only be a problem if the code tries to store a tuple that's wider than a page, and field compression isn't sufficient to get it under a page. Given that the TOAST creation threshold is intended to be about a quarter of a page, it's very hard to believe that cross-version differences in the do-we-need-a-toast- table heuristic could result in an observable problem. So let's just follow the old version's conclusion about whether a TOAST table is needed. (If we ever do change needs_toast_table() so much that this conclusion doesn't apply, we can devise a solution at that time, and hopefully do it in a less klugy way than 4c6780fd17aa43ed did.) Back-patch to 9.3, like the previous patch. Discussion: <8110.1462291671@sss.pgh.pa.us>
2016-05-06 22:05:51 -04:00
* clusters, so set those after we have restored the schema.
*/
if (GET_MAJOR_VERSION(old_cluster.major_version) < 903)
set_frozenxids(true);
/* regenerate now that we have objects in the databases */
get_db_and_rel_infos(&new_cluster);
uninstall_support_functions_from_new_cluster();
}
/*
* Delete the given subdirectory contents from the new cluster
*/
static void
remove_new_subdir(char *subdir, bool rmtopdir)
{
char new_path[MAXPGPATH];
prep_status("Deleting files from new %s", subdir);
snprintf(new_path, sizeof(new_path), "%s/%s", new_cluster.pgdata, subdir);
if (!rmtree(new_path, rmtopdir))
pg_fatal("could not delete directory \"%s\"\n", new_path);
check_ok();
}
/*
* Copy the files from the old cluster into it
*/
static void
copy_subdir_files(char *subdir)
{
char old_path[MAXPGPATH];
char new_path[MAXPGPATH];
remove_new_subdir(subdir, true);
snprintf(old_path, sizeof(old_path), "%s/%s", old_cluster.pgdata, subdir);
snprintf(new_path, sizeof(new_path), "%s/%s", new_cluster.pgdata, subdir);
prep_status("Copying old %s to new server", subdir);
exec_prog(UTILITY_LOG_FILE, NULL, true,
#ifndef WIN32
"cp -Rf \"%s\" \"%s\"",
#else
/* flags: everything, no confirm, quiet, overwrite read-only */
"xcopy /e /y /q /r \"%s\" \"%s\\\"",
#endif
old_path, new_path);
check_ok();
}
static void
copy_clog_xlog_xid(void)
{
/* copy old commit logs to new data dir */
copy_subdir_files("pg_clog");
/* set the next transaction id and epoch of the new cluster */
prep_status("Setting next transaction ID and epoch for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/pg_resetxlog\" -f -x %u \"%s\"",
new_cluster.bindir, old_cluster.controldata.chkpnt_nxtxid,
new_cluster.pgdata);
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/pg_resetxlog\" -f -e %u \"%s\"",
new_cluster.bindir, old_cluster.controldata.chkpnt_nxtepoch,
new_cluster.pgdata);
check_ok();
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
/*
* If the old server is before the MULTIXACT_FORMATCHANGE_CAT_VER change
* (see pg_upgrade.h) and the new server is after, then we don't copy
* pg_multixact files, but we need to reset pg_control so that the new
* server doesn't attempt to read multis older than the cutoff value.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
{
copy_subdir_files("pg_multixact/offsets");
copy_subdir_files("pg_multixact/members");
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
prep_status("Setting next multixact ID and offset for new cluster");
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
/*
* we preserve all files and contents, so we must preserve both "next"
* counters here and the oldest multi present on system.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/pg_resetxlog\" -O %u -m %u,%u \"%s\"",
new_cluster.bindir,
old_cluster.controldata.chkpnt_nxtmxoff,
old_cluster.controldata.chkpnt_nxtmulti,
old_cluster.controldata.chkpnt_oldstMulti,
new_cluster.pgdata);
check_ok();
}
else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
* the new multi-xid value. "members" starts at zero so no need to
* remove it.
*/
remove_new_subdir("pg_multixact/offsets", false);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
prep_status("Setting oldest multixact ID on new cluster");
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 12:04:59 -03:00
/*
* We don't preserve files in this case, but it's important that the
* oldest multi is set to the latest value used by the old system, so
* that multixact.c returns the empty set for multis that might be
* present on disk. We set next multi to the value following that; it
* might end up wrapped around (i.e. 0) if the old cluster had
* next=MaxMultiXactId, but multixact.c can cope with that just fine.
*/
exec_prog(UTILITY_LOG_FILE, NULL, true,
"\"%s/pg_resetxlog\" -m %u,%u \"%s\"",
new_cluster.bindir,
old_cluster.controldata.chkpnt_nxtmulti + 1,
old_cluster.controldata.chkpnt_nxtmulti,
new_cluster.pgdata);
check_ok();
}
/* now reset the wal archives in the new cluster */
prep_status("Resetting WAL archives");
exec_prog(UTILITY_LOG_FILE, NULL, true,
/* use timeline 1 to match controldata and no WAL history file */
"\"%s/pg_resetxlog\" -l 00000001%s \"%s\"", new_cluster.bindir,
old_cluster.controldata.nextxlogfile + 8,
new_cluster.pgdata);
check_ok();
}
/*
* set_frozenxids()
*
* This is called on the new cluster before we restore anything, with
* minmxid_only = false. Its purpose is to ensure that all initdb-created
* vacuumable tables have relfrozenxid/relminmxid matching the old cluster's
* xid/mxid counters. We also initialize the datfrozenxid/datminmxid of the
* built-in databases to match.
*
* As we create user tables later, their relfrozenxid/relminmxid fields will
* be restored properly by the binary-upgrade restore script. Likewise for
* user-database datfrozenxid/datminmxid. However, if we're upgrading from a
* pre-9.3 database, which does not store per-table or per-DB minmxid, then
* the relminmxid/datminmxid values filled in by the restore script will just
* be zeroes.
*
* Hence, with a pre-9.3 source database, a second call occurs after
* everything is restored, with minmxid_only = true. This pass will
* initialize all tables and databases, both those made by initdb and user
* objects, with the desired minmxid value. frozenxid values are left alone.
*/
static void
set_frozenxids(bool minmxid_only)
{
int dbnum;
2010-07-06 19:19:02 +00:00
PGconn *conn,
*conn_template1;
PGresult *dbres;
int ntups;
int i_datname;
int i_datallowconn;
if (!minmxid_only)
prep_status("Setting frozenxid and minmxid counters in new cluster");
else
prep_status("Setting minmxid counter in new cluster");
conn_template1 = connectToServer(&new_cluster, "template1");
if (!minmxid_only)
/* set pg_database.datfrozenxid */
PQclear(executeQueryOrDie(conn_template1,
"UPDATE pg_catalog.pg_database "
"SET datfrozenxid = '%u'",
old_cluster.controldata.chkpnt_nxtxid));
/* set pg_database.datminmxid */
PQclear(executeQueryOrDie(conn_template1,
"UPDATE pg_catalog.pg_database "
"SET datminmxid = '%u'",
old_cluster.controldata.chkpnt_nxtmulti));
/* get database names */
dbres = executeQueryOrDie(conn_template1,
"SELECT datname, datallowconn "
"FROM pg_catalog.pg_database");
i_datname = PQfnumber(dbres, "datname");
i_datallowconn = PQfnumber(dbres, "datallowconn");
ntups = PQntuples(dbres);
for (dbnum = 0; dbnum < ntups; dbnum++)
{
2010-07-06 19:19:02 +00:00
char *datname = PQgetvalue(dbres, dbnum, i_datname);
char *datallowconn = PQgetvalue(dbres, dbnum, i_datallowconn);
/*
2010-07-06 19:19:02 +00:00
* We must update databases where datallowconn = false, e.g.
* template0, because autovacuum increments their datfrozenxids,
* relfrozenxids, and relminmxid even if autovacuum is turned off,
* and even though all the data rows are already frozen To enable
* this, we temporarily change datallowconn.
*/
if (strcmp(datallowconn, "f") == 0)
PQclear(executeQueryOrDie(conn_template1,
2010-07-06 19:19:02 +00:00
"UPDATE pg_catalog.pg_database "
"SET datallowconn = true "
"WHERE datname = '%s'", datname));
conn = connectToServer(&new_cluster, datname);
if (!minmxid_only)
/* set pg_class.relfrozenxid */
PQclear(executeQueryOrDie(conn,
"UPDATE pg_catalog.pg_class "
"SET relfrozenxid = '%u' "
/* only heap, materialized view, and TOAST are vacuumed */
"WHERE relkind IN ('r', 'm', 't')",
old_cluster.controldata.chkpnt_nxtxid));
/* set pg_class.relminmxid */
PQclear(executeQueryOrDie(conn,
"UPDATE pg_catalog.pg_class "
"SET relminmxid = '%u' "
/* only heap, materialized view, and TOAST are vacuumed */
"WHERE relkind IN ('r', 'm', 't')",
old_cluster.controldata.chkpnt_nxtmulti));
PQfinish(conn);
/* Reset datallowconn flag */
if (strcmp(datallowconn, "f") == 0)
PQclear(executeQueryOrDie(conn_template1,
2010-07-06 19:19:02 +00:00
"UPDATE pg_catalog.pg_database "
"SET datallowconn = false "
"WHERE datname = '%s'", datname));
}
PQclear(dbres);
PQfinish(conn_template1);
check_ok();
}
static void
cleanup(void)
{
fclose(log_opts.internal);
/* Remove dump and log files? */
if (!log_opts.retain)
{
int dbnum;
char **filename;
for (filename = output_files; *filename != NULL; filename++)
unlink(*filename);
/* remove dump files */
unlink(GLOBALS_DUMP_FILE);
if (old_cluster.dbarr.dbs)
for (dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
{
char sql_file_name[MAXPGPATH],
log_file_name[MAXPGPATH];
DbInfo *old_db = &old_cluster.dbarr.dbs[dbnum];
snprintf(sql_file_name, sizeof(sql_file_name), DB_DUMP_FILE_MASK, old_db->db_oid);
unlink(sql_file_name);
snprintf(log_file_name, sizeof(log_file_name), DB_DUMP_LOG_FILE_MASK, old_db->db_oid);
unlink(log_file_name);
}
}
}