From 049877301e4367250f7fbac13be623dfad3bd052 Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Wed, 16 Oct 2024 14:29:09 +0400 Subject: [PATCH] chore: log provider stack traces on text file busy (#15078) re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by https://github.com/coder/terraform-provider-coder/pull/295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis (cherry picked from commit d676ad56fe0913f2c9714f28d1fb51a5a57d2712) --- provisioner/terraform/executor.go | 45 ++++++++- provisioner/terraform/provision.go | 62 +++++++++++++ provisioner/terraform/provision_test.go | 93 ++++++++++++++++++- .../terraform/testdata/fake_text_file_busy.sh | 40 ++++++++ 4 files changed, 235 insertions(+), 5 deletions(-) create mode 100755 provisioner/terraform/testdata/fake_text_file_busy.sh diff --git a/provisioner/terraform/executor.go b/provisioner/terraform/executor.go index 08231489f40a2..916847e28cc58 100644 --- a/provisioner/terraform/executor.go +++ b/provisioner/terraform/executor.go @@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver return version.NewVersion(vj.Version) } +type textFileBusyError struct { + exitErr *exec.ExitError + stderr string +} + +func (e *textFileBusyError) Error() string { + return "text file busy: " + e.exitErr.String() +} + func (e *executor) init(ctx, killCtx context.Context, logr logSink) error { ctx, span := e.server.startTrace(ctx, tracing.FuncName()) defer span.End() @@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error { <-doneErr }() + // As a special case, we want to look for the error "text file busy" in the stderr output of + // the init command, so we also take a copy of the stderr into an in memory buffer. + errBuf := newBufferedWriteCloser(errWriter) + args := []string{ "init", "-no-color", "-input=false", } - return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter) + err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf) + var exitErr *exec.ExitError + if xerrors.As(err, &exitErr) { + if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) { + return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()} + } + } + return err } func getPlanFilePath(workdir string) string { @@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) { defer sw.mut.Unlock() return sw.w.Write(p) } + +type bufferedWriteCloser struct { + wc io.WriteCloser + b bytes.Buffer +} + +func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser { + return &bufferedWriteCloser{ + wc: wc, + } +} + +func (b *bufferedWriteCloser) Write(p []byte) (int, error) { + n, err := b.b.Write(p) + if err != nil { + return n, err + } + return b.wc.Write(p) +} + +func (b *bufferedWriteCloser) Close() error { + return b.wc.Close() +} diff --git a/provisioner/terraform/provision.go b/provisioner/terraform/provision.go index 9ff56de111662..67655ad140b7b 100644 --- a/provisioner/terraform/provision.go +++ b/provisioner/terraform/provision.go @@ -4,7 +4,11 @@ import ( "context" "encoding/json" "fmt" + "io" + "net" + "net/http" "os" + "path/filepath" "strings" "time" @@ -109,10 +113,32 @@ func (s *server) Plan( initTimings.ingest(createInitTimingsEvent(timingInitStart)) err = e.init(ctx, killCtx, sess) + if err != nil { initTimings.ingest(createInitTimingsEvent(timingInitErrored)) s.logger.Debug(ctx, "init failed", slog.Error(err)) + + // Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726 + // We believe this might be due to some race condition that prevents the + // terraform-provider-coder process from exiting. When terraform tries to install the + // provider during this init, it copies over the local cache. Normally this isn't an issue, + // but if the terraform-provider-coder process is still running from a previous build, Linux + // returns "text file busy" error when attempting to open the file. + // + // Capturing the stack trace from the process should help us figure out why it has not + // exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to + // notice, and also because it indicates this provisioner could be permanently broken and + // require a restart. + var errTFB *textFileBusyError + if xerrors.As(err, &errTFB) { + stacktrace := tryGettingCoderProviderStacktrace(sess) + s.logger.Critical(ctx, "init: text file busy", + slog.Error(errTFB), + slog.F("stderr", errTFB.stderr), + slog.F("provider_coder_stacktrace", stacktrace), + ) + } return provisionersdk.PlanErrorf("initialize terraform: %s", err) } @@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) { } } } + +// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to +// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which +// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us +// to figure out why it hasn't exited. +func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string { + path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof")) + sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path)) + c := http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + d := net.Dialer{} + return d.DialContext(ctx, "unix", path) + }, + }, + } + req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet, + "http://localhost/debug/pprof/goroutine?debug=2", nil) + if err != nil { + sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err)) + return "" + } + resp, err := c.Do(req) + if err != nil { + // Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder + // in v1.0.4 + sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err)) + return "" + } + defer resp.Body.Close() + stacktraces, err := io.ReadAll(resp.Body) + if err != nil { + sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err)) + } + return string(stacktraces) +} diff --git a/provisioner/terraform/provision_test.go b/provisioner/terraform/provision_test.go index 37ccaddbb2989..5ffec949afe17 100644 --- a/provisioner/terraform/provision_test.go +++ b/provisioner/terraform/provision_test.go @@ -9,6 +9,8 @@ import ( "encoding/json" "errors" "fmt" + "net" + "net/http" "os" "path/filepath" "runtime" @@ -31,6 +33,8 @@ import ( type provisionerServeOptions struct { binaryPath string exitTimeout time.Duration + workDir string + logger *slog.Logger } func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) { @@ -38,7 +42,13 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont opts = &provisionerServeOptions{} } cachePath := t.TempDir() - workDir := t.TempDir() + if opts.workDir == "" { + opts.workDir = t.TempDir() + } + if opts.logger == nil { + logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) + opts.logger = &logger + } client, server := drpc.MemTransportPipe() ctx, cancelFunc := context.WithCancel(context.Background()) serverErr := make(chan error, 1) @@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{ ServeOptions: &provisionersdk.ServeOptions{ Listener: server, - Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug), - WorkDirectory: workDir, + Logger: *opts.logger, + WorkDirectory: opts.workDir, }, BinaryPath: opts.binaryPath, CachePath: cachePath, @@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) { dir := t.TempDir() binPath := filepath.Join(dir, "terraform") - // Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@" + // Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@" content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String()) err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec require.NoError(t, err) @@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) { } } +// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than +// one process tries to do this, it can cause "text file busy" to be returned to us. In this test +// we want to simulate "text file busy" getting logged by terraform, due to an issue with the +// terraform-provider-coder +// nolint: paralleltest +func TestProvision_TextFileBusy(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("This test uses unix sockets and is not supported on Windows") + } + + cwd, err := os.Getwd() + require.NoError(t, err) + fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh") + + dir := t.TempDir() + binPath := filepath.Join(dir, "terraform") + + // Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@" + content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String()) + err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec + require.NoError(t, err) + + workDir := t.TempDir() + + err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700) + require.NoError(t, err) + l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof")) + require.NoError(t, err) + defer l.Close() + handlerCalled := 0 + // nolint: gosec + srv := &http.Server{ + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path) + w.WriteHeader(http.StatusOK) + _, err := w.Write([]byte("thestacks\n")) + assert.NoError(t, err) + handlerCalled++ + }), + } + srvErr := make(chan error, 1) + go func() { + srvErr <- srv.Serve(l) + }() + + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + ctx, api := setupProvisioner(t, &provisionerServeOptions{ + binaryPath: binPath, + exitTimeout: time.Second, + workDir: workDir, + logger: &logger, + }) + + sess := configure(ctx, t, api, &proto.Config{ + TemplateSourceArchive: makeTar(t, nil), + }) + + err = sendPlan(sess, proto.WorkspaceTransition_START) + require.NoError(t, err) + + found := false + for { + msg, err := sess.Recv() + require.NoError(t, err) + + if c := msg.GetPlan(); c != nil { + require.Contains(t, c.Error, "exit status 1") + found = true + break + } + } + require.True(t, found) + require.EqualValues(t, 1, handlerCalled) +} + func TestProvision(t *testing.T) { t.Parallel() diff --git a/provisioner/terraform/testdata/fake_text_file_busy.sh b/provisioner/terraform/testdata/fake_text_file_busy.sh new file mode 100755 index 0000000000000..341c8136c36c3 --- /dev/null +++ b/provisioner/terraform/testdata/fake_text_file_busy.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +VERSION=$1 +shift 1 + +json_print() { + echo "{\"@level\":\"error\",\"@message\":\"$*\"}" +} + +case "$1" in +version) + cat <<-EOF + { + "terraform_version": "${VERSION}", + "platform": "linux_amd64", + "provider_selections": {}, + "terraform_outdated": false + } + EOF + exit 0 + ;; +init) + echo "init" + echo >&2 "Error: Failed to install provider" + echo >&2 " Error while installing coder/coder v1.0.4: open" + echo >&2 " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:" + echo >&2 " text file busy" + exit 1 + ;; +plan) + echo "plan not supported" + exit 1 + ;; +apply) + echo "apply not supported" + exit 1 + ;; +esac + +exit 10