Skip to content

chore: log provider stack traces on text file busy #15249

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion provisioner/terraform/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
return version.NewVersion(vj.Version)
}

type textFileBusyError struct {
exitErr *exec.ExitError
stderr string
}

func (e *textFileBusyError) Error() string {
return "text file busy: " + e.exitErr.String()
}

func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
defer span.End()
Expand All @@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
<-doneErr
}()

// As a special case, we want to look for the error "text file busy" in the stderr output of
// the init command, so we also take a copy of the stderr into an in memory buffer.
errBuf := newBufferedWriteCloser(errWriter)

args := []string{
"init",
"-no-color",
"-input=false",
}

return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
var exitErr *exec.ExitError
if xerrors.As(err, &exitErr) {
if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()}
}
}
return err
}

func getPlanFilePath(workdir string) string {
Expand Down Expand Up @@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
defer sw.mut.Unlock()
return sw.w.Write(p)
}

type bufferedWriteCloser struct {
wc io.WriteCloser
b bytes.Buffer
}

func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
return &bufferedWriteCloser{
wc: wc,
}
}

func (b *bufferedWriteCloser) Write(p []byte) (int, error) {
n, err := b.b.Write(p)
if err != nil {
return n, err
}
return b.wc.Write(p)
}

func (b *bufferedWriteCloser) Close() error {
return b.wc.Close()
}
62 changes: 62 additions & 0 deletions provisioner/terraform/provision.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ import (
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"os"
"path/filepath"
"strings"
"time"

Expand Down Expand Up @@ -109,10 +113,32 @@ func (s *server) Plan(
initTimings.ingest(createInitTimingsEvent(timingInitStart))

err = e.init(ctx, killCtx, sess)

if err != nil {
initTimings.ingest(createInitTimingsEvent(timingInitErrored))

s.logger.Debug(ctx, "init failed", slog.Error(err))

// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
// We believe this might be due to some race condition that prevents the
// terraform-provider-coder process from exiting. When terraform tries to install the
// provider during this init, it copies over the local cache. Normally this isn't an issue,
// but if the terraform-provider-coder process is still running from a previous build, Linux
// returns "text file busy" error when attempting to open the file.
//
// Capturing the stack trace from the process should help us figure out why it has not
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
// notice, and also because it indicates this provisioner could be permanently broken and
// require a restart.
var errTFB *textFileBusyError
if xerrors.As(err, &errTFB) {
stacktrace := tryGettingCoderProviderStacktrace(sess)
s.logger.Critical(ctx, "init: text file busy",
slog.Error(errTFB),
slog.F("stderr", errTFB.stderr),
slog.F("provider_coder_stacktrace", stacktrace),
)
}
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
}

Expand Down Expand Up @@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
}
}
}

// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
// to figure out why it hasn't exited.
func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
c := http.Client{
Transport: &http.Transport{
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
d := net.Dialer{}
return d.DialContext(ctx, "unix", path)
},
},
}
req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet,
"http://localhost/debug/pprof/goroutine?debug=2", nil)
if err != nil {
sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err))
return ""
}
resp, err := c.Do(req)
if err != nil {
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
// in v1.0.4
sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
return ""
}
defer resp.Body.Close()
stacktraces, err := io.ReadAll(resp.Body)
if err != nil {
sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
}
return string(stacktraces)
}
93 changes: 89 additions & 4 deletions provisioner/terraform/provision_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"encoding/json"
"errors"
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
Expand All @@ -31,14 +33,22 @@ import (
type provisionerServeOptions struct {
binaryPath string
exitTimeout time.Duration
workDir string
logger *slog.Logger
}

func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
if opts == nil {
opts = &provisionerServeOptions{}
}
cachePath := t.TempDir()
workDir := t.TempDir()
if opts.workDir == "" {
opts.workDir = t.TempDir()
}
if opts.logger == nil {
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
opts.logger = &logger
}
client, server := drpc.MemTransportPipe()
ctx, cancelFunc := context.WithCancel(context.Background())
serverErr := make(chan error, 1)
Expand All @@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
ServeOptions: &provisionersdk.ServeOptions{
Listener: server,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
WorkDirectory: workDir,
Logger: *opts.logger,
WorkDirectory: opts.workDir,
},
BinaryPath: opts.binaryPath,
CachePath: cachePath,
Expand Down Expand Up @@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
dir := t.TempDir()
binPath := filepath.Join(dir, "terraform")

// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
require.NoError(t, err)
Expand Down Expand Up @@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
}
}

// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
// terraform-provider-coder
// nolint: paralleltest
func TestProvision_TextFileBusy(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("This test uses unix sockets and is not supported on Windows")
}

cwd, err := os.Getwd()
require.NoError(t, err)
fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")

dir := t.TempDir()
binPath := filepath.Join(dir, "terraform")

// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
require.NoError(t, err)

workDir := t.TempDir()

err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
require.NoError(t, err)
l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
require.NoError(t, err)
defer l.Close()
handlerCalled := 0
// nolint: gosec
srv := &http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
w.WriteHeader(http.StatusOK)
_, err := w.Write([]byte("thestacks\n"))
assert.NoError(t, err)
handlerCalled++
}),
}
srvErr := make(chan error, 1)
go func() {
srvErr <- srv.Serve(l)
}()

logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
ctx, api := setupProvisioner(t, &provisionerServeOptions{
binaryPath: binPath,
exitTimeout: time.Second,
workDir: workDir,
logger: &logger,
})

sess := configure(ctx, t, api, &proto.Config{
TemplateSourceArchive: makeTar(t, nil),
})

err = sendPlan(sess, proto.WorkspaceTransition_START)
require.NoError(t, err)

found := false
for {
msg, err := sess.Recv()
require.NoError(t, err)

if c := msg.GetPlan(); c != nil {
require.Contains(t, c.Error, "exit status 1")
found = true
break
}
}
require.True(t, found)
require.EqualValues(t, 1, handlerCalled)
}

func TestProvision(t *testing.T) {
t.Parallel()

Expand Down
40 changes: 40 additions & 0 deletions provisioner/terraform/testdata/fake_text_file_busy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/sh

VERSION=$1
shift 1

json_print() {
echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
}

case "$1" in
version)
cat <<-EOF
{
"terraform_version": "${VERSION}",
"platform": "linux_amd64",
"provider_selections": {},
"terraform_outdated": false
}
EOF
exit 0
;;
init)
echo "init"
echo >&2 "Error: Failed to install provider"
echo >&2 " Error while installing coder/coder v1.0.4: open"
echo >&2 " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
echo >&2 " text file busy"
exit 1
;;
plan)
echo "plan not supported"
exit 1
;;
apply)
echo "apply not supported"
exit 1
;;
esac

exit 10
Loading