59 changes: 40 additions & 19 deletions cmd/src/snapshot_upload.go
@@ -36,6 +36,7 @@ BUCKET
 	flagSet := flag.NewFlagSet("upload", flag.ExitOnError)
 	bucketName := flagSet.String("bucket", "", "destination Cloud Storage bucket name")
 	credentialsPath := flagSet.String("credentials", "", "JSON credentials file for Google Cloud service account")
+	trimExtensions := flagSet.Bool("trim-extensions", true, "trim EXTENSION statements from database dumps for import to Google Cloud SQL")
 
 	snapshotCommands = append(snapshotCommands, &command{
 		flagSet: flagSet,
@@ -59,8 +60,9 @@ BUCKET
 	}
 
 	type upload struct {
-		file *os.File
-		stat os.FileInfo
+		file           *os.File
+		stat           os.FileInfo
+		trimExtensions bool
 	}
 	var (
 		uploads []upload // index aligned with progressBars
@@ -76,8 +78,9 @@ BUCKET
 			return errors.Wrap(err, "get file size")
 		}
 		uploads = append(uploads, upload{
-			file: f,
-			stat: stat,
+			file:           f,
+			stat:           stat,
+			trimExtensions: false, // not a database dump
 		})
 		progressBars = append(progressBars, output.ProgressBar{
 			Label: stat.Name(),
@@ -95,8 +98,9 @@ BUCKET
 			return errors.Wrap(err, "get file size")
 		}
 		uploads = append(uploads, upload{
-			file: f,
-			stat: stat,
+			file:           f,
+			stat:           stat,
+			trimExtensions: *trimExtensions,
 		})
 		progressBars = append(progressBars, output.ProgressBar{
 			Label: stat.Name(),
@@ -116,7 +120,7 @@ BUCKET
 		g.Go(func(ctx context.Context) error {
 			progressFn := func(p int64) { progress.SetValue(i, float64(p)) }
 
-			if err := copyToBucket(ctx, u.file, u.stat, bucket, progressFn); err != nil {
+			if err := copyDumpToBucket(ctx, u.file, u.stat, bucket, progressFn, u.trimExtensions); err != nil {
 				return errors.Wrap(err, u.stat.Name())
 			}
 
@@ -139,26 +143,43 @@ BUCKET
 		})
 	}
 
-func copyToBucket(ctx context.Context, src io.Reader, stat fs.FileInfo, dst *storage.BucketHandle, progressFn func(int64)) error {
-	writer := dst.Object(stat.Name()).NewWriter(ctx)
-	writer.ProgressFunc = progressFn
-	defer writer.Close()
+func copyDumpToBucket(ctx context.Context, src io.ReadSeeker, stat fs.FileInfo, dst *storage.BucketHandle, progressFn func(int64), trimExtensions bool) error {
+	// Set up object to write to
+	object := dst.Object(stat.Name()).NewWriter(ctx)
+	object.ProgressFunc = progressFn
+	defer object.Close()
 
+	// To assert against actual file size
+	var totalWritten int64
+
+	// Do a partial copy that trims out unwanted statements
+	if trimExtensions {
+		written, err := pgdump.PartialCopyWithoutExtensions(object, src, progressFn)
+		if err != nil {
+			return errors.Wrap(err, "trim extensions and upload")
+		}
+		totalWritten += written
+	}
+
 	// io.Copy is the best way to copy from a reader to writer in Go, and storage.Writer
-	// has its own chunking mechanisms internally.
-	written, err := io.Copy(writer, src)
+	// has its own chunking mechanisms internally. io.Reader is stateful, so this copy
+	// simply continues from where PartialCopyWithoutExtensions left off.
+	written, err := io.Copy(object, src)
 	if err != nil {
-		return err
+		return errors.Wrap(err, "upload")
 	}
+	totalWritten += written
 
-	// Progress is not called on completion, so we call it manually after io.Copy is done
+	// Progress is not called on completion of io.Copy, so we call it manually after to
+	// update our pretty progress bars.
 	progressFn(written)
 
-	// Validate we have sent all data
+	// Validate we have sent all data. PartialCopyWithoutExtensions may add some bytes,
+	// so the check is not a strict equality.
 	size := stat.Size()
-	if written != size {
-		return errors.Newf("expected to write %d bytes, but actually wrote %d bytes",
-			size, written)
+	if totalWritten < size {
+		return errors.Newf("expected to write %d bytes, but actually wrote %d bytes (short by %d bytes)",
+			size, totalWritten, size-totalWritten)
 	}
 
 	return nil
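For context, a minimal sketch (not part of this diff) of the filter-then-stream handoff that copyDumpToBucket performs. The module import path is an assumption, and internal/pgdump is only importable from inside this repository, so treat it as illustrative:

package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	// assumed module path; internal packages resolve only within src-cli itself
	"github.com/sourcegraph/src-cli/internal/pgdump"
)

func main() {
	// A toy dump: an extension comment, then the first table definition.
	src := strings.NewReader(`COMMENT ON EXTENSION pg_trgm
CREATE TABLE repos (
...
)
`)
	var dst bytes.Buffer

	// Phase 1: line-by-line filtering until the first CREATE TABLE.
	if _, err := pgdump.PartialCopyWithoutExtensions(&dst, src, func(int64) {}); err != nil {
		panic(err)
	}

	// Phase 2: src was re-seeked to the filter's stopping point, so a plain
	// io.Copy streams the remainder untouched.
	if _, err := io.Copy(&dst, src); err != nil {
		panic(err)
	}

	fmt.Print(dst.String()) // the COMMENT ON EXTENSION line is now "-- " prefixed
}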
67 changes: 67 additions & 0 deletions internal/pgdump/extensions.go
@@ -0,0 +1,67 @@
package pgdump

import (
	"bufio"
	"bytes"
	"io"

	"github.com/sourcegraph/sourcegraph/lib/errors"
)

// PartialCopyWithoutExtensions will perform a partial copy of a SQL database dump from
// src to dst while commenting out EXTENSION-related statements (currently COMMENT ON
// EXTENSION lines). When it determines there are no more EXTENSION-related statements,
// it will return, resetting src to the position of the last contents written to dst.
//
// This is needed for import to Google Cloud SQL, which does not accept many EXTENSION
// statements. For more details, see https://cloud.google.com/sql/docs/postgres/import-export/import-export-dmp
//
// Filtering requires reading entire lines into memory - this can be a very expensive
// operation, so when filtering is complete the more efficient io.Copy should be used
// to perform the remainder of the copy from src to dst.
func PartialCopyWithoutExtensions(dst io.Writer, src io.ReadSeeker, progressFn func(int64)) (int64, error) {
	var (
		reader = bufio.NewReader(src)
		// position we have consumed up to - tracked separately, because bufio.Reader
		// may have read ahead on src. This allows us to reset src later.
		consumed int64
		// number of bytes we have actually written to dst - it should always be returned.
		written int64
		// set to true when we have done all our filtering
		noMoreExtensions bool
	)

	for !noMoreExtensions {
		// Read up to a line, keeping track of our position in src
		line, err := reader.ReadBytes('\n')
		consumed += int64(len(line))
		if err != nil {
			return written, err
		}

		// Once we start seeing table creations, we are definitely done with extensions,
		// so we can hand off the rest to the superior io.Copy implementation.
Comment on lines +42 to +43

Contributor: Are we sure about this? I'm lacking some context on whether this is used by end users, but this is a large assumption. The official Postgres docs use tools like split for large dumps: https://www.postgresql.org/docs/12/backup-dump.html#BACKUP-DUMP-LARGE

Member (author): If you're referring to the assumption that extensions always come before table creations, then yes - extensions are typically a prerequisite.

As for handing off to io.Copy, yes - the io.Copy implementation is more robust and efficient, and the GCS bucket handler will handle chunking for us. I don't think Cloud SQL will be happy with split dumps, and to piece them together we would need to download them somewhere, which I'd like to avoid (right now the process can just have us import directly from GCS).

Member (author): I think we can revisit things like split if size becomes a blocker, but the streaming nature of io.Copy should mitigate the effects of size.
if bytes.HasPrefix(line, []byte("CREATE TABLE")) {
// we are done with extensions
noMoreExtensions = true
} else if bytes.HasPrefix(line, []byte("COMMENT ON EXTENSION")) {
// comment out this line
line = append([]byte("-- "), line...)
}

// Write this line and update our progress before returning on error
lineWritten, err := dst.Write(line)
written += int64(lineWritten)
progressFn(written)
if err != nil {
return written, err
}
}

// No more extensions - reset src to the last actual consumed position
_, err := src.Seek(consumed, io.SeekStart)
if err != nil {
return written, errors.Wrap(err, "reset src position")
}
return written, nil
}
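A quick illustration (again, not from the diff) of why consumed is tracked by hand: bufio.Reader buffers ahead of what ReadBytes returns, so the underlying reader's offset overshoots and must be re-seeked before handing off to io.Copy:

package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

func main() {
	src := strings.NewReader("line one\nline two\nline three\n")
	reader := bufio.NewReader(src)

	line, _ := reader.ReadBytes('\n') // logically consumes only "line one\n"...
	fmt.Println(len(line))            // 9

	pos, _ := src.Seek(0, io.SeekCurrent) // ...but bufio has buffered all of src
	fmt.Println(pos)                      // 29

	// Seeking back to the consumed count lets io.Copy resume after "line one\n".
	src.Seek(int64(len(line)), io.SeekStart)
	rest, _ := io.Copy(io.Discard, src)
	fmt.Println(rest) // 20
}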
70 changes: 70 additions & 0 deletions internal/pgdump/extensions_test.go
@@ -0,0 +1,70 @@
package pgdump

import (
	"bytes"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"testing"

	"github.com/hexops/autogold"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestPartialCopyWithoutExtensions(t *testing.T) {
	if runtime.GOOS == "windows" {
		t.Skip("Test doesn't work on Windows because of weirdness with t.TempDir() handling")
	}

	// Create test data in a temp file to exercise a file-backed io.ReadSeeker
	src, err := os.Create(filepath.Join(t.TempDir(), t.Name()))
	require.NoError(t, err)
	_, err = src.WriteString(`-- Some comment

CREATE EXTENSION foobar

COMMENT ON EXTENSION barbaz

CREATE TYPE asdf

CREATE TABLE robert (
...
)

CREATE TABLE bobhead (
...
)`)
	require.NoError(t, err)
	_, err = src.Seek(0, io.SeekStart)
	require.NoError(t, err)

	// Set up target to assert against
	var dst bytes.Buffer

	// Perform partial copy
	_, err = PartialCopyWithoutExtensions(&dst, src, func(i int64) {})
	assert.NoError(t, err)

	// Copy rest of contents
	_, err = io.Copy(&dst, src)
	assert.NoError(t, err)

	// Assert contents (update with -update)
	autogold.Want("partial-copy-without-extensions", `-- Some comment

CREATE EXTENSION foobar

-- COMMENT ON EXTENSION barbaz

CREATE TYPE asdf

CREATE TABLE robert (
...
)

CREATE TABLE bobhead (
...
)`).Equal(t, dst.String())
}