-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-15067: [C++] Add tracing spans to the scanner #11964
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bcc31f8
7515381
0c6af07
27e5ec1
d73e4ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,7 @@ | |
| #include "arrow/util/async_generator.h" | ||
| #include "arrow/util/iterator.h" | ||
| #include "arrow/util/logging.h" | ||
| #include "arrow/util/tracing_internal.h" | ||
| #include "arrow/util/utf8.h" | ||
|
|
||
| namespace arrow { | ||
|
|
@@ -148,9 +149,14 @@ static inline Result<csv::ReadOptions> GetReadOptions( | |
| static inline Future<std::shared_ptr<csv::StreamingReader>> OpenReaderAsync( | ||
| const FileSource& source, const CsvFileFormat& format, | ||
| const std::shared_ptr<ScanOptions>& scan_options, Executor* cpu_executor) { | ||
| #ifdef ARROW_WITH_OPENTELEMETRY | ||
| auto tracer = arrow::internal::tracing::GetTracer(); | ||
| auto span = tracer->StartSpan("arrow::dataset::CsvFileFormat::OpenReaderAsync"); | ||
| #endif | ||
| ARROW_ASSIGN_OR_RAISE(auto reader_options, GetReadOptions(format, scan_options)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Technically the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The span will be marked as finished in its destructor by default. Explicit marking is only required when you want to control the end time: https://github.com/open-telemetry/opentelemetry-cpp/blob/f20f72f3a904b215fc750b67b206f158aeb61241/sdk/src/trace/span.cc#L89-L92
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, that makes sense! |
||
|
|
||
| ARROW_ASSIGN_OR_RAISE(auto input, source.OpenCompressed()); | ||
| auto path = source.path(); | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| input, io::BufferedInputStream::Create(reader_options.block_size, | ||
| default_memory_pool(), std::move(input))); | ||
|
|
@@ -171,11 +177,20 @@ static inline Future<std::shared_ptr<csv::StreamingReader>> OpenReaderAsync( | |
| })); | ||
| return reader_fut.Then( | ||
| // Adds the filename to the error | ||
| [](const std::shared_ptr<csv::StreamingReader>& reader) | ||
| -> Result<std::shared_ptr<csv::StreamingReader>> { return reader; }, | ||
| [source](const Status& err) -> Result<std::shared_ptr<csv::StreamingReader>> { | ||
| return err.WithMessage("Could not open CSV input source '", source.path(), | ||
| "': ", err); | ||
| [=](const std::shared_ptr<csv::StreamingReader>& reader) | ||
| -> Result<std::shared_ptr<csv::StreamingReader>> { | ||
| #ifdef ARROW_WITH_OPENTELEMETRY | ||
| span->SetStatus(opentelemetry::trace::StatusCode::kOk); | ||
westonpace marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| span->End(); | ||
| #endif | ||
| return reader; | ||
| }, | ||
| [=](const Status& err) -> Result<std::shared_ptr<csv::StreamingReader>> { | ||
| #ifdef ARROW_WITH_OPENTELEMETRY | ||
| arrow::internal::tracing::MarkSpan(err, span.get()); | ||
| span->End(); | ||
| #endif | ||
| return err.WithMessage("Could not open CSV input source '", path, "': ", err); | ||
| }); | ||
| } | ||
|
|
||
|
|
@@ -276,7 +291,12 @@ Result<RecordBatchGenerator> CsvFileFormat::ScanBatchesAsync( | |
| auto source = file->source(); | ||
| auto reader_fut = | ||
| OpenReaderAsync(source, *this, scan_options, ::arrow::internal::GetCpuThreadPool()); | ||
| return GeneratorFromReader(std::move(reader_fut), scan_options->batch_size); | ||
| auto generator = GeneratorFromReader(std::move(reader_fut), scan_options->batch_size); | ||
| #ifdef ARROW_WITH_OPENTELEMETRY | ||
| generator = arrow::internal::tracing::WrapAsyncGenerator( | ||
| std::move(generator), "arrow::dataset::CsvFileFormat::ScanBatchesAsync::Next"); | ||
westonpace marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| #endif | ||
| return generator; | ||
| } | ||
|
|
||
| Future<util::optional<int64_t>> CsvFileFormat::CountRows( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,9 +85,10 @@ Iterator<T> WrapIterator( | |
|
|
||
| template <typename T> | ||
| AsyncGenerator<T> WrapAsyncGenerator(AsyncGenerator<T> wrapped, | ||
| opentelemetry::trace::StartSpanOptions options, | ||
| const std::string& span_name) { | ||
| return [=]() mutable -> Future<T> { | ||
| auto span = GetTracer()->StartSpan(span_name); | ||
| auto span = GetTracer()->StartSpan(span_name, {}, options); | ||
| auto scope = GetTracer()->WithActiveSpan(span); | ||
| auto fut = wrapped(); | ||
| fut.AddCallback([span](const Result<T>& result) { | ||
|
|
@@ -97,6 +98,58 @@ AsyncGenerator<T> WrapAsyncGenerator(AsyncGenerator<T> wrapped, | |
| return fut; | ||
| }; | ||
| } | ||
|
|
||
| /// \brief Start a new span named \p span_name for each invocation of the generator \p wrapped. | ||
| /// | ||
| /// The parent span of the new span will be the currently active span (if any) as of when | ||
| /// WrapAsyncGenerator was itself called: its context is read once here, stored in `options.parent`, and forwarded to the StartSpanOptions overload. | ||
| template <typename T> | ||
| AsyncGenerator<T> WrapAsyncGenerator(AsyncGenerator<T> wrapped, | ||
| const std::string& span_name) { | ||
| opentelemetry::trace::StartSpanOptions options; | ||
| options.parent = GetTracer()->GetCurrentSpan()->GetContext(); | ||
| return WrapAsyncGenerator(std::move(wrapped), std::move(options), span_name); | ||
| } | ||
|
|
||
| /// \brief Tie the given span to an async generator: activate it on every call and record each outcome on it. | ||
| /// | ||
| /// The span will be made the active span each time the generator is called. A successful item sets the span status to kOk; a failed Status is passed to MarkSpan and then returned unchanged. NOTE(review): End() is never called here, so the span presumably ends only when the last copy captured by the generator is released -- confirm against the OpenTelemetry Span destructor. | ||
| template <typename T> | ||
| AsyncGenerator<T> TieSpanToAsyncGenerator( | ||
| AsyncGenerator<T> wrapped, | ||
| opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span> span) { | ||
| return [=]() mutable -> Future<T> { | ||
| auto scope = GetTracer()->WithActiveSpan(span); | ||
| return wrapped().Then( | ||
| [span](const T& result) -> Result<T> { | ||
| span->SetStatus(opentelemetry::trace::StatusCode::kOk); | ||
| return result; | ||
| }, | ||
| [span](const Status& status) -> Result<T> { | ||
| MarkSpan(status, span.get()); | ||
| return status; | ||
| }); | ||
| }; | ||
| } | ||
|
|
||
| /// \brief Activate the given span on each invocation of an async generator; the future returned by \p wrapped is passed through as-is and the span's status is not modified here. | ||
| template <typename T> | ||
| AsyncGenerator<T> PropagateSpanThroughAsyncGenerator( | ||
| AsyncGenerator<T> wrapped, | ||
| opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span> span) { | ||
| return [=]() mutable -> Future<T> { | ||
| auto scope = GetTracer()->WithActiveSpan(span); | ||
| return wrapped(); | ||
| }; | ||
| } | ||
|
Comment on lines
+136
to
+144
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On second glance this helper method seems a little off to me. Is the "active span" a thread local concept? Will this work even if
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes, active span is thread local. One way to get around this would be to instrument the Executor and possibly Future classes themselves, but I worry this would have more overhead than is desirable. (Or maybe not. I haven't tried.)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Concretely I'm thinking of... If you are I/O bound then I would expect
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think that's essentially the same (OpenTelemetry maintains a context which is thread-local by default, I think it can even be swapped out depending on how we want to go about things?). I'll try to take a look at this approach when I get a chance.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should it be done in a follow-up? I think, at the moment, the consequence would be that spans don't have proper parentage but other than that it should be fairly harmless.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think we can do that. If it pans out we can hopefully replace the manual instrumentation done here. I'll file a JIRA to explore this further.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| /// \brief Activate, on each invocation of an async generator, the span that is current when this function is called; returns \p wrapped unchanged if that span's context is not valid. | ||
| template <typename T> | ||
| AsyncGenerator<T> PropagateSpanThroughAsyncGenerator(AsyncGenerator<T> wrapped) { | ||
| auto span = GetTracer()->GetCurrentSpan(); | ||
| if (!span->GetContext().IsValid()) return wrapped; | ||
| return PropagateSpanThroughAsyncGenerator(std::move(wrapped), std::move(span)); | ||
| } | ||
| #endif | ||
|
|
||
| } // namespace tracing | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we push this `ifdef` into `StartSpan` by returning a dummy span object with no-op methods?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could. I didn't want to wrap too much of the API; I also figured this would be best if people were very concerned about overhead.