OSDN Git Service

Further improvements in pg_ctl's new wait-for-postmaster-start logic.
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 1 Jun 2011 17:09:07 +0000 (13:09 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 1 Jun 2011 17:09:07 +0000 (13:09 -0400)
Add a postmaster_is_alive() test to the wait loop, so that we stop waiting
if the postmaster dies without removing its pidfile.  Unfortunately this
only helps after the postmaster has created its pidfile, since until then
we don't know which PID to check.  But if it never does create the pidfile,
we can give up in a relatively short time, so this is a useful addition
in practice.  Per suggestion from Fujii Masao, though this doesn't look
very much like his patch.

In addition, improve pg_ctl's ability to cope with pre-existing pidfiles.
Such a file might or might not represent a live postmaster that is going to
block our postmaster from starting, but the previous code pre-judged the
situation and gave up waiting immediately.  Now, we will wait for up to 5
seconds to see if our postmaster overwrites such a file.  This issue
interacts with Fujii's patch because we would make the wrong conclusion
if we did the postmaster_is_alive() test with a pre-existing PID.

All of this could be improved if we rewrote start_postmaster() so that it
could report the child postmaster's PID, so that we'd know a-priori the
correct PID to test with postmaster_is_alive().  That looks like a bit too
much change for so late in the 9.1 development cycle, unfortunately.

src/bin/pg_ctl/pg_ctl.c

index 7af3b91..c4b8f15 100644 (file)
@@ -366,6 +366,10 @@ start_postmaster(void)
        /*
         * Since there might be quotes to handle here, it is easier simply to pass
         * everything to a shell to process them.
+        *
+        * XXX it would be better to fork and exec so that we would know the
+        * child postmaster's PID directly; then test_postmaster_connection could
+        * use the PID without having to rely on reading it back from the pidfile.
         */
        if (log_file != NULL)
                snprintf(cmd, MAXPGPATH, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE,
@@ -412,6 +416,8 @@ static PGPing
 test_postmaster_connection(bool do_checkpoint)
 {
        PGPing          ret = PQPING_NO_RESPONSE;
+       bool            found_stale_pidfile = false;
+       pgpid_t         pm_pid = 0;
        char            connstr[MAXPGPATH * 2 + 256];
        int                     i;
 
@@ -458,7 +464,7 @@ test_postmaster_connection(bool do_checkpoint)
                                if (optlines[3] == NULL)
                                {
                                        /* File is exactly three lines, must be pre-9.1 */
-                                       write_stderr(_("%s: -w option is not supported when starting a pre-9.1 server\n"),
+                                       write_stderr(_("\n%s: -w option is not supported when starting a pre-9.1 server\n"),
                                                                 progname);
                                        return PQPING_NO_ATTEMPT;
                                }
@@ -466,70 +472,86 @@ test_postmaster_connection(bool do_checkpoint)
                                                 optlines[5] != NULL)
                                {
                                        /* File is complete enough for us, parse it */
+                                       long            pmpid;
                                        time_t          pmstart;
-                                       int                     portnum;
-                                       char       *sockdir;
-                                       char       *hostaddr;
-                                       char            host_str[MAXPGPATH];
 
                                        /*
-                                        * Easy cross-check that we are looking at the right data
-                                        * directory: is the postmaster older than this execution
-                                        * of pg_ctl?  Subtract 2 seconds to allow for possible
-                                        * clock skew between pg_ctl and the postmaster.
+                                        * Make sanity checks.  If it's for a standalone backend
+                                        * (negative PID), or the recorded start time is before
+                                        * pg_ctl started, then either we are looking at the wrong
+                                        * data directory, or this is a pre-existing pidfile that
+                                        * hasn't (yet?) been overwritten by our child postmaster.
+                                        * Allow 2 seconds slop for possible cross-process clock
+                                        * skew.
                                         */
+                                       pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]);
                                        pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]);
-                                       if (pmstart < start_time - 2)
+                                       if (pmpid <= 0 || pmstart < start_time - 2)
                                        {
-                                               write_stderr(_("%s: this data directory is running a pre-existing postmaster\n"),
-                                                                        progname);
-                                               return PQPING_NO_ATTEMPT;
+                                               /*
+                                                * Set flag to report stale pidfile if it doesn't
+                                                * get overwritten before we give up waiting.
+                                                */
+                                               found_stale_pidfile = true;
                                        }
+                                       else
+                                       {
+                                               /*
+                                                * OK, seems to be a valid pidfile from our child.
+                                                */
+                                               int                     portnum;
+                                               char       *sockdir;
+                                               char       *hostaddr;
+                                               char            host_str[MAXPGPATH];
 
-                                       /*
-                                        * OK, extract port number and host string to use. Prefer
-                                        * using Unix socket if available.
-                                        */
-                                       portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
+                                               found_stale_pidfile = false;
+                                               pm_pid = (pgpid_t) pmpid;
 
-                                       sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
-                                       hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
+                                               /*
+                                                * Extract port number and host string to use. Prefer
+                                                * using Unix socket if available.
+                                                */
+                                               portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]);
+                                               sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1];
+                                               hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1];
 
-                                       /*
-                                        * While unix_socket_directory can accept relative
-                                        * directories, libpq's host parameter must have a leading
-                                        * slash to indicate a socket directory.  So, ignore
-                                        * sockdir if it's relative, and try to use TCP instead.
-                                        */
-                                       if (sockdir[0] == '/')
-                                               strlcpy(host_str, sockdir, sizeof(host_str));
-                                       else
-                                               strlcpy(host_str, hostaddr, sizeof(host_str));
+                                               /*
+                                                * While unix_socket_directory can accept relative
+                                                * directories, libpq's host parameter must have a
+                                                * leading slash to indicate a socket directory.  So,
+                                                * ignore sockdir if it's relative, and try to use TCP
+                                                * instead.
+                                                */
+                                               if (sockdir[0] == '/')
+                                                       strlcpy(host_str, sockdir, sizeof(host_str));
+                                               else
+                                                       strlcpy(host_str, hostaddr, sizeof(host_str));
 
-                                       /* remove trailing newline */
-                                       if (strchr(host_str, '\n') != NULL)
-                                               *strchr(host_str, '\n') = '\0';
+                                               /* remove trailing newline */
+                                               if (strchr(host_str, '\n') != NULL)
+                                                       *strchr(host_str, '\n') = '\0';
 
-                                       /* Fail if we couldn't get either sockdir or host addr */
-                                       if (host_str[0] == '\0')
-                                       {
-                                               write_stderr(_("%s: -w option cannot use a relative socket directory specification\n"),
-                                                                        progname);
-                                               return PQPING_NO_ATTEMPT;
-                                       }
+                                               /* Fail if couldn't get either sockdir or host addr */
+                                               if (host_str[0] == '\0')
+                                               {
+                                                       write_stderr(_("\n%s: -w option cannot use a relative socket directory specification\n"),
+                                                                                progname);
+                                                       return PQPING_NO_ATTEMPT;
+                                               }
 
-                                       /* If postmaster is listening on "*", use "localhost" */
-                                       if (strcmp(host_str, "*") == 0)
-                                               strcpy(host_str, "localhost");
+                                               /* If postmaster is listening on "*", use localhost */
+                                               if (strcmp(host_str, "*") == 0)
+                                                       strcpy(host_str, "localhost");
 
-                                       /*
-                                        * We need to set connect_timeout otherwise on Windows the
-                                        * Service Control Manager (SCM) will probably timeout
-                                        * first.
-                                        */
-                                       snprintf(connstr, sizeof(connstr),
-                                          "dbname=postgres port=%d host='%s' connect_timeout=5",
-                                                        portnum, host_str);
+                                               /*
+                                                * We need to set connect_timeout otherwise on Windows
+                                                * the Service Control Manager (SCM) will probably
+                                                * timeout first.
+                                                */
+                                               snprintf(connstr, sizeof(connstr),
+                                                                "dbname=postgres port=%d host='%s' connect_timeout=5",
+                                                                portnum, host_str);
+                                       }
                                }
                        }
                }
@@ -545,7 +567,11 @@ test_postmaster_connection(bool do_checkpoint)
                /*
                 * The postmaster should create postmaster.pid very soon after being
                 * started.  If it's not there after we've waited 5 or more seconds,
-                * assume startup failed and give up waiting.
+                * assume startup failed and give up waiting.  (Note this covers
+                * both cases where the pidfile was never created, and where it was
+                * created and then removed during postmaster exit.)  Also, if there
+                * *is* a file there but it appears stale, issue a suitable warning
+                * and give up waiting.
                 */
                if (i >= 5)
                {
@@ -553,8 +579,24 @@ test_postmaster_connection(bool do_checkpoint)
 
                        if (stat(pid_file, &statbuf) != 0)
                                return PQPING_NO_RESPONSE;
+
+                       if (found_stale_pidfile)
+                       {
+                               write_stderr(_("\n%s: this data directory appears to be running a pre-existing postmaster\n"),
+                                                        progname);
+                               return PQPING_NO_RESPONSE;
+                       }
                }
 
+               /*
+                * If we've been able to identify the child postmaster's PID, check
+                * the process is still alive.  This covers cases where the postmaster
+                * successfully created the pidfile but then crashed without removing
+                * it.
+                */
+               if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid))
+                       return PQPING_NO_RESPONSE;
+
                /* No response, or startup still in process; wait */
 #if defined(WIN32)
                if (do_checkpoint)
@@ -715,7 +757,6 @@ do_init(void)
 static void
 do_start(void)
 {
-       pgpid_t         pid;
        pgpid_t         old_pid = 0;
        int                     exitcode;
 
@@ -765,19 +806,6 @@ do_start(void)
                exit(1);
        }
 
-       if (old_pid != 0)
-       {
-               pg_usleep(1000000);
-               pid = get_pgpid();
-               if (pid == old_pid)
-               {
-                       write_stderr(_("%s: could not start server\n"
-                                                  "Examine the log output.\n"),
-                                                progname);
-                       exit(1);
-               }
-       }
-
        if (do_wait)
        {
                print_msg(_("waiting for server to start..."));