From e7b1325497ee469374be2ffb2ddcd1a85b69bad7 Mon Sep 17 00:00:00 2001 From: Tim Mwangi Date: Fri, 24 May 2024 11:36:09 -0700 Subject: [PATCH 1/3] fix: recover from exporter panic in our custom batch_span_processor --- .../batch_span_processor.modified.go | 10 ++ .../batch_span_processor_modified_test.go | 127 ++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go diff --git a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go index 82ab5060..00469592 100644 --- a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go +++ b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go @@ -16,6 +16,7 @@ package batchspanprocessor // import "github.com/hypertrace/goagent/instrumentat import ( "context" + "fmt" "sync" "sync/atomic" "time" @@ -285,6 +286,15 @@ func (bsp *batchSpanProcessor) exportSpans(ctx context.Context) error { if l := len(bsp.batch); l > 0 { Debug("exporting spans", "count", len(bsp.batch), "total_dropped", atomic.LoadUint32(&bsp.dropped)) + defer func() { + if r := recover(); r != nil { + Error(fmt.Errorf("panic value: %v", r), "recovering from a panic") + // Reset the batch if len is greater than 0 + if len(bsp.batch) > 0 { + bsp.batch = bsp.batch[:0] + } + } + }() err := bsp.e.ExportSpans(ctx, bsp.batch) // A new batch is always created after exporting, even if the batch failed to be exported. diff --git a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go new file mode 100644 index 00000000..c8187bae --- /dev/null +++ b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go @@ -0,0 +1,127 @@ +package batchspanprocessor + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/trace" +) + +const ( + panicSpanStr string = "panic_span" + tracerNameStr string = "tracer1" +) + +func TestCustomBspNonPanicExporterShouldNotPanic(t *testing.T) { + tracer, verifyFunc := setupTracer(t, true, false) + + startAndEndSpan(tracer, "span1") + startAndEndSpan(tracer, "span2") + startAndEndSpan(tracer, panicSpanStr) + time.Sleep(8 * time.Millisecond) + + verifyFunc(3) + startAndEndSpan(tracer, "span4") + time.Sleep(8 * time.Millisecond) + verifyFunc(4) +} + +func TestCustomBspPanicExporterShouldNotPanic(t *testing.T) { + tracer, verifyFunc := setupTracer(t, true, true) + + startAndEndSpan(tracer, "span1") + startAndEndSpan(tracer, "span2") + startAndEndSpan(tracer, panicSpanStr) + time.Sleep(8 * time.Millisecond) + verifyFunc(2) + + startAndEndSpan(tracer, "span4") + time.Sleep(8 * time.Millisecond) + verifyFunc(3) + + // Only span5 and span6 will be exported. span8 is discarded since the spans loop does not get to it before + // the panic + startAndEndSpan(tracer, "span5") + startAndEndSpan(tracer, "span6") + startAndEndSpan(tracer, panicSpanStr) + startAndEndSpan(tracer, "span8") + time.Sleep(8 * time.Millisecond) + verifyFunc(5) + + startAndEndSpan(tracer, "span9") + startAndEndSpan(tracer, "span10") + startAndEndSpan(tracer, "span11") + time.Sleep(8 * time.Millisecond) + verifyFunc(8) +} + +func TestCustomBspPanicExporterGoodSpansShouldNotPanic(t *testing.T) { + tracer, verifyFunc := setupTracer(t, true, false) + + startAndEndSpan(tracer, "span1") + startAndEndSpan(tracer, "span2") + startAndEndSpan(tracer, "span3") + time.Sleep(8 * time.Millisecond) + + verifyFunc(3) + + startAndEndSpan(tracer, "span4") + time.Sleep(8 * time.Millisecond) + verifyFunc(4) +} + +func setupTracer(t *testing.T, useCustomBsp bool, enablePanic bool) (trace.Tracer, func(int)) { + exporter := &mockPanickingSpanExporter{panics: enablePanic} + exportTimeout := 5 * time.Millisecond + bsp := CreateBatchSpanProcessor(useCustomBsp, exporter, + sdktrace.WithBatchTimeout(exportTimeout)) + assert.NotNil(t, bsp) + + sampler := sdktrace.AlwaysSample() + tp := sdktrace.NewTracerProvider( + sdktrace.WithSampler(sampler), + sdktrace.WithSpanProcessor(bsp), + ) + + return tp.Tracer(tracerNameStr), func(expectedExportedSpans int) { + verifyExporter(t, exporter, expectedExportedSpans) + } +} + +func verifyExporter(t *testing.T, e *mockPanickingSpanExporter, expectedSpansExporter int) { + assert.Equal(t, expectedSpansExporter, e.exportedCount) +} + +func startAndEndSpan(tracer trace.Tracer, spanName string) { + _, s := tracer.Start(context.Background(), spanName) + time.Sleep(1 * time.Millisecond) + s.End() +} + +type mockPanickingSpanExporter struct { + panics bool + exportedCount int +} + +func (e *mockPanickingSpanExporter) ExportSpans(_ context.Context, spans []sdktrace.ReadOnlySpan) error { + if !e.panics { + e.exportedCount = e.exportedCount + len(spans) + return nil + } + + for _, span := range spans { + if span.Name() == panicSpanStr { + panic("panic span in span list") + } else { + e.exportedCount = e.exportedCount + 1 + } + } + return nil +} + +func (e *mockPanickingSpanExporter) Shutdown(_ context.Context) error { + return nil +} From 65a1ddc0a4f7dcffb701034d9b67a8528848228d Mon Sep 17 00:00:00 2001 From: Tim Mwangi Date: Fri, 24 May 2024 13:10:10 -0700 Subject: [PATCH 2/3] add panic stacktrace to error log --- .../batchspanprocessor/batch_span_processor.modified.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go index 00469592..4792d945 100644 --- a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go +++ b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor.modified.go @@ -17,6 +17,7 @@ package batchspanprocessor // import "github.com/hypertrace/goagent/instrumentat import ( "context" "fmt" + "runtime/debug" "sync" "sync/atomic" "time" @@ -288,7 +289,7 @@ func (bsp *batchSpanProcessor) exportSpans(ctx context.Context) error { Debug("exporting spans", "count", len(bsp.batch), "total_dropped", atomic.LoadUint32(&bsp.dropped)) defer func() { if r := recover(); r != nil { - Error(fmt.Errorf("panic value: %v", r), "recovering from a panic") + Error(fmt.Errorf("panic value: %v.\n\n[stacktrace]:\n%s", r, string(debug.Stack())), "recovering from a panic") // Reset the batch if len is greater than 0 if len(bsp.batch) > 0 { bsp.batch = bsp.batch[:0] From ff994c1d636b1ca75af6243e21c350a0418e9d1d Mon Sep 17 00:00:00 2001 From: Tim Mwangi Date: Fri, 24 May 2024 13:57:54 -0700 Subject: [PATCH 3/3] increase sleep time to have a more stable test --- .../batch_span_processor_modified_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go index c8187bae..8305405c 100644 --- a/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go +++ b/instrumentation/opentelemetry/batchspanprocessor/batch_span_processor_modified_test.go @@ -35,11 +35,11 @@ func TestCustomBspPanicExporterShouldNotPanic(t *testing.T) { startAndEndSpan(tracer, "span1") startAndEndSpan(tracer, "span2") startAndEndSpan(tracer, panicSpanStr) - time.Sleep(8 * time.Millisecond) + time.Sleep(10 * time.Millisecond) verifyFunc(2) startAndEndSpan(tracer, "span4") - time.Sleep(8 * time.Millisecond) + time.Sleep(10 * time.Millisecond) verifyFunc(3) // Only span5 and span6 will be exported. span8 is discarded since the spans loop does not get to it before @@ -48,13 +48,13 @@ func TestCustomBspPanicExporterShouldNotPanic(t *testing.T) { startAndEndSpan(tracer, "span6") startAndEndSpan(tracer, panicSpanStr) startAndEndSpan(tracer, "span8") - time.Sleep(8 * time.Millisecond) + time.Sleep(10 * time.Millisecond) verifyFunc(5) startAndEndSpan(tracer, "span9") startAndEndSpan(tracer, "span10") startAndEndSpan(tracer, "span11") - time.Sleep(8 * time.Millisecond) + time.Sleep(10 * time.Millisecond) verifyFunc(8) }