Immediately read screen process

code-asher · code-asher · commit 7a8ec2e582f0 · 2023-08-09T17:43:47.000-08:00
It is possible for the process to exit immediately (for example, if you
run `echo hello`). In that case we would wait for the `version` command
to succeed, but it never will because the session is already gone.

If we immediately start reading from the process then we can tell when it
has gone away and abort.
diff --git a/agent/agent_test.go b/agent/agent_test.go
@@ -1720,6 +1720,18 @@ func TestAgent_ReconnectingPTY(t *testing.T) {
 				line := scanner3.Text()
 				t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, ""))
 			}
+
+			// Try a non-shell command.  It should output then immediately exit.
+			netConn4, err := conn.ReconnectingPTY(ctx, uuid.New(), 100, 100, "echo test")
+			require.NoError(t, err)
+			defer netConn4.Close()
+
+			scanner4 := bufio.NewScanner(netConn4)
+			require.True(t, hasLine(scanner4, matchEchoOutput), "find exit command")
+			for scanner4.Scan() {
+				line := scanner4.Text()
+				t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, ""))
+			}
 		})
 	}
 }
diff --git a/agent/reconnectingpty/buffered.go b/agent/reconnectingpty/buffered.go
@@ -94,6 +94,8 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo
 				}
 				// Could have been killed externally or failed to start at all (command
 				// not found for example).
+				// TODO: Should we check the process's exit code in case the command was
+				//       invalid?
 				rpty.Close("unable to read pty output, command might have exited")
 				break
 			}
diff --git a/agent/reconnectingpty/screen.go b/agent/reconnectingpty/screen.go
@@ -162,8 +162,15 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne
 
 	go heartbeat(ctx, rpty.timer, rpty.timeout)
 
-	ptty, process, err := rpty.doAttach(ctx, height, width, logger)
+	ptty, process, err := rpty.doAttach(ctx, conn, height, width, logger)
 	if err != nil {
+		if errors.Is(err, context.Canceled) {
+			// Likely the process was too short-lived and canceled the version command.
+			// TODO: Is it worth distinguishing between that and a cancel from the
+			//       Attach() caller?  Additionally, since this could also happen if
+			//       the command was invalid, should we check the process's exit code?
+			return nil
+		}
 		return err
 	}
 
@@ -180,53 +187,6 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne
 		}
 	}()
 
-	// Pipe pty -> conn.
-	// We do not need to separately monitor for the process exiting.  When it
-	// exits, our ptty.OutputReader() will return EOF after reading all process
-	// output.
-	go func() {
-		// Close the connection when the process exits.  Log only for debugging
-		// since the connection might have already closed on its own.
-		defer func() {
-			err := conn.Close()
-			if err != nil {
-				logger.Debug(ctx, "closed connection with error", slog.Error(err))
-			}
-		}()
-		buffer := make([]byte, 1024)
-		for {
-			read, err := ptty.OutputReader().Read(buffer)
-			if err != nil {
-				// When the PTY is closed, this is triggered.
-				// Error is typically a benign EOF, so only log for debugging.
-				if errors.Is(err, io.EOF) {
-					logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
-				} else {
-					logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
-					rpty.metrics.WithLabelValues("screen_output_reader").Add(1)
-				}
-				// The process might have died because the session itself died or it
-				// might have been separately killed and the session is still up (for
-				// example `exit` or we killed it when the connection closed).  If the
-				// session is still up we might leave the reconnecting pty in memory
-				// around longer than it needs to be but it will eventually clean up
-				// with the timer or context, or the next attach will respawn the screen
-				// daemon which is fine too.
-				break
-			}
-			part := buffer[:read]
-			_, err = conn.Write(part)
-			if err != nil {
-				// Connection might have been closed.
-				if errors.Unwrap(err).Error() != "endpoint is closed for send" {
-					logger.Warn(ctx, "error writing to active conn", slog.Error(err))
-					rpty.metrics.WithLabelValues("screen_write").Add(1)
-				}
-				break
-			}
-		}
-	}()
-
 	// Pipe conn -> pty and block.
 	readConnLoop(ctx, conn, ptty, rpty.metrics, logger)
 	return nil
@@ -235,7 +195,7 @@ func (rpty *screenReconnectingPTY) Attach(ctx context.Context, _ string, conn ne
 // doAttach spawns the screen client and starts the heartbeat.  It exists
 // separately only so we can defer the mutex unlock which is not possible in
 // Attach since it blocks.
-func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) {
+func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, conn net.Conn, height, width uint16, logger slog.Logger) (pty.PTYCmd, pty.Process, error) {
 	// Ensure another attach does not come in and spawn a duplicate session.
 	rpty.mutex.Lock()
 	defer rpty.mutex.Unlock()
@@ -273,12 +233,65 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u
 		return nil, nil, err
 	}
 
+	// This context lets us abort the version command if the process dies.
+	versionCtx, versionCancel := context.WithCancel(ctx)
+	defer versionCancel()
+
+	// Pipe pty -> conn and close the connection when the process exits.
+	// We do not need to separately monitor for the process exiting.  When it
+	// exits, our ptty.OutputReader() will return EOF after reading all process
+	// output.
+	go func() {
+		defer versionCancel()
+		defer func() {
+			err := conn.Close()
+			if err != nil {
+				// Log only for debugging since the connection might have already closed
+				// on its own.
+				logger.Debug(ctx, "closed connection with error", slog.Error(err))
+			}
+		}()
+		buffer := make([]byte, 1024)
+		for {
+			read, err := ptty.OutputReader().Read(buffer)
+			if err != nil {
+				// When the PTY is closed, this is triggered.
+				// Error is typically a benign EOF, so only log for debugging.
+				if errors.Is(err, io.EOF) {
+					logger.Debug(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
+				} else {
+					logger.Warn(ctx, "unable to read pty output; screen might have exited", slog.Error(err))
+					rpty.metrics.WithLabelValues("screen_output_reader").Add(1)
+				}
+				// The process might have died because the session itself died or it
+				// might have been separately killed and the session is still up (for
+				// example `exit` or we killed it when the connection closed).  If the
+				// session is still up we might leave the reconnecting pty in memory
+				// around longer than it needs to be but it will eventually clean up
+				// with the timer or context, or the next attach will respawn the screen
+				// daemon which is fine too.
+				break
+			}
+			part := buffer[:read]
+			_, err = conn.Write(part)
+			if err != nil {
+				// Connection might have been closed.
+				if errors.Unwrap(err).Error() != "endpoint is closed for send" {
+					logger.Warn(ctx, "error writing to active conn", slog.Error(err))
+					rpty.metrics.WithLabelValues("screen_write").Add(1)
+				}
+				break
+			}
+		}
+	}()
+
 	// Version seems to be the only command without a side effect (other than
 	// making the version pop up briefly) so use it to wait for the session to
 	// come up.  If we do not wait we could end up spawning multiple sessions with
 	// the same name.
-	err = rpty.sendCommand(ctx, "version", nil)
+	err = rpty.sendCommand(versionCtx, "version", nil)
 	if err != nil {
+		// Log only for debugging since the process might already have closed.
 		closeErr := ptty.Close()
 		if closeErr != nil {
 			logger.Debug(ctx, "closed ptty with error", slog.Error(closeErr))
@@ -298,8 +311,9 @@ func (rpty *screenReconnectingPTY) doAttach(ctx context.Context, height, width u
 // command fails with an error matching anything in successErrors it will be
 // considered a success state (for example "no session" when quitting and the
 // session is already dead).  The command will be retried until successful, the
-// timeout is reached, or the context ends in which case the context error is
-// returned together with the last error from the command.
+// timeout is reached, or the context ends.  A canceled context will return the
+// canceled context's error as-is while a timed-out context returns together
+// with the last error from the command.
 func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command string, successErrors []string) error {
 	ctx, cancel := context.WithTimeout(ctx, attachTimeout)
 	defer cancel()
@@ -352,6 +366,9 @@ func (rpty *screenReconnectingPTY) sendCommand(ctx context.Context, command stri
 	for {
 		select {
 		case <-ctx.Done():
+			if errors.Is(ctx.Err(), context.Canceled) {
+				return ctx.Err()
+			}
 			return errors.Join(ctx.Err(), lastErr)
 		case <-ticker.C:
 			if done := run(); done {

Original file line number	Diff line number	Diff line change
`@@ -1720,6 +1720,18 @@ func TestAgent_ReconnectingPTY(t *testing.T) {`
`1720`	`1720`	`line := scanner3.Text()`
`1721`	`1721`	`t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, ""))`
`1722`	`1722`	`}`
	`1723`	`+`
	`1724`	`+ // Try a non-shell command. It should output then immediately exit.`
	`1725`	`+ netConn4, err := conn.ReconnectingPTY(ctx, uuid.New(), 100, 100, "echo test")`
	`1726`	`+ require.NoError(t, err)`
	`1727`	`+ defer netConn4.Close()`
	`1728`	`+`
	`1729`	`+ scanner4 := bufio.NewScanner(netConn4)`
	`1730`	`+ require.True(t, hasLine(scanner4, matchEchoOutput), "find exit command")`
	`1731`	`+ for scanner4.Scan() {`
	`1732`	`+ line := scanner4.Text()`
	`1733`	`+ t.Logf("bash tty stdout = %s", re.ReplaceAllString(line, ""))`
	`1734`	`+ }`
`1723`	`1735`	`})`
`1724`	`1736`	`}`
`1725`	`1737`	`}`
Original file line number	Diff line number	Diff line change
`@@ -94,6 +94,8 @@ func newBuffered(ctx context.Context, cmd *pty.Cmd, options *Options, logger slo`
`94`	`94`	`}`
`95`	`95`	`// Could have been killed externally or failed to start at all (command`
`96`	`96`	`// not found for example).`
	`97`	`+ // TODO: Should we check the process's exit code in case the command was`
	`98`	`+ // invalid?`
`97`	`99`	`rpty.Close("unable to read pty output, command might have exited")`
`98`	`100`	`break`
`99`	`101`	`}`