apache · joosthooz · Feb 13, 2023 · Feb 13, 2023 · Feb 13, 2023 · Feb 13, 2023
@@ -73,12 +73,12 @@ class FilterNode : public MapNode {
   Result<ExecBatch> ProcessBatch(ExecBatch batch) override {
     ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
                           SimplifyWithGuarantee(filter_, batch.guarantee));
-
     arrow::util::tracing::Span span;
     START_COMPUTE_SPAN(span, "Filter",
                        {{"filter.expression", ToStringExtra()},
                         {"filter.expression.simplified", simplified_filter.ToString()},
-                        {"filter.length", batch.length}});
+                        {"filter.length", batch.length},
+                        {"input_batch.size_bytes", batch.TotalBufferSize()}});
 
     ARROW_ASSIGN_OR_RAISE(
         Datum mask, ExecuteScalarExpression(simplified_filter, batch,
@@ -87,8 +87,10 @@ class FilterNode : public MapNode {
     if (mask.is_scalar()) {
       const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
       if (mask_scalar.is_valid && mask_scalar.value) {
+        ATTRIBUTE_ON_CURRENT_SPAN("output_batch.size_bytes", batch.TotalBufferSize());
         return batch;
       }
+      ATTRIBUTE_ON_CURRENT_SPAN("output_batch.size_bytes", 0);
       return batch.Slice(0, 0);
     }
 
@@ -101,7 +103,13 @@ class FilterNode : public MapNode {
       if (value.is_scalar()) continue;
       ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
     }
-    return ExecBatch::Make(std::move(values));
+    // Unwrap the Result before reading from it: if ExecBatch::Make fails the error
+    // must be returned to the caller rather than dereferenced to record the
+    // tracing attribute.
+    ARROW_ASSIGN_OR_RAISE(ExecBatch filtered_batch, ExecBatch::Make(std::move(values)));
+    ATTRIBUTE_ON_CURRENT_SPAN("output_batch.size_bytes",
+                              filtered_batch.TotalBufferSize());
+    return filtered_batch;
   }
 
  protected:

@@ -79,19 +79,25 @@ class ProjectNode : public MapNode {
 
   Result<ExecBatch> ProcessBatch(ExecBatch batch) override {
     std::vector<Datum> values{exprs_.size()};
+    arrow::util::tracing::Span span;
+    START_COMPUTE_SPAN(span, "Project",
+                       {{"project.length", batch.length},
+                        {"input_batch.size_bytes", batch.TotalBufferSize()}});
     for (size_t i = 0; i < exprs_.size(); ++i) {
-      arrow::util::tracing::Span span;
-      START_COMPUTE_SPAN(span, "Project",
-                         {{"project.type", exprs_[i].type()->ToString()},
-                          {"project.length", batch.length},
-                          {"project.expression", exprs_[i].ToString()}});
+      std::string project_name = "project[" + std::to_string(i) + "]";
+      ATTRIBUTE_ON_CURRENT_SPAN(project_name + ".type", exprs_[i].type()->ToString());
+      ATTRIBUTE_ON_CURRENT_SPAN(project_name + ".expression", exprs_[i].ToString());
       ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
                             SimplifyWithGuarantee(exprs_[i], batch.guarantee));
 
       ARROW_ASSIGN_OR_RAISE(
           values[i], ExecuteScalarExpression(simplified_expr, batch,
                                              plan()->query_context()->exec_context()));
     }
-    return ExecBatch{std::move(values), batch.length};
+    // Measure the projected batch itself; the input batch's size is already recorded
+    // as "input_batch.size_bytes" when the span is started.
+    ExecBatch projected{std::move(values), batch.length};
+    ATTRIBUTE_ON_CURRENT_SPAN("output_batch.size_bytes", projected.TotalBufferSize());
+    return projected;
   }
 

@@ -539,6 +539,7 @@ struct OrderBySinkNode final : public SinkNode {
 
   Status Finish() override {
     arrow::util::tracing::Span span;
+    START_SPAN(span, std::string(kind_name()) + "::Finish");
     ARROW_RETURN_NOT_OK(DoFinish());
     return SinkNode::Finish();
   }

@@ -133,6 +133,8 @@ struct SourceNode : ExecNode, public TracedNode {
     plan_->query_context()->ScheduleTask(
         [this, morsel_length, use_legacy_batching, initial_batch_index, morsel,
          has_ordering = !ordering_.is_unordered()]() {
+          arrow::util::tracing::Span span;
+          START_SPAN(span, "SourceNode::ProcessMorsel");
           int64_t offset = 0;
           int batch_index = initial_batch_index;
           do {
@@ -163,6 +165,7 @@ struct SourceNode : ExecNode, public TracedNode {
 
   Status StartProducing() override {
     NoteStartProducing(ToStringExtra());
+
     {
       // If another exec node encountered an error during its StartProducing call
       // it might have already called StopProducing on all of its inputs (including this
@@ -184,6 +187,9 @@ struct SourceNode : ExecNode, public TracedNode {
     options.should_schedule = ShouldSchedule::IfDifferentExecutor;
     ARROW_ASSIGN_OR_RAISE(Future<> scan_task, plan_->query_context()->BeginExternalTask(
                                                   "SourceNode::DatasetScan"));
+    arrow::util::tracing::Span span;
+    START_SPAN(span, "SourceNode::DatasetScan");
+
     if (!scan_task.is_valid()) {
       // Plan has already been aborted, no need to start scanning
       return Status::OK();
@@ -195,9 +201,6 @@ struct SourceNode : ExecNode, public TracedNode {
       }
       lock.unlock();
 
-      arrow::util::tracing::Span fetch_batch_span;
-      auto fetch_batch_scope =
-          START_SCOPED_SPAN(fetch_batch_span, "SourceNode::ReadBatch");
       return generator_().Then(
           [this](
               const std::optional<ExecBatch>& morsel_or_end) -> Future<ControlFlow<int>> {

diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc
@@ -219,7 +219,7 @@ struct FunctionExecutorImpl : public FunctionExecutor {
   }
 
   Result<Datum> Execute(const std::vector<Datum>& args, int64_t passed_length) override {
-    util::tracing::Span span;
+    arrow::util::tracing::Span span;
 
     auto func_kind = func.kind();
     const auto& func_name = func.name();

diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
@@ -43,12 +43,14 @@
 #include "arrow/type.h"
 #include "arrow/type_fwd.h"
 #include "arrow/util/async_generator.h"
+#include "arrow/util/byte_size.h"
 #include "arrow/util/future.h"
 #include "arrow/util/iterator.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/task_group.h"
 #include "arrow/util/thread_pool.h"
+#include "arrow/util/tracing_internal.h"
 #include "arrow/util/utf8_internal.h"
 #include "arrow/util/vector.h"
 
@@ -881,6 +883,8 @@ class StreamingReaderImpl : public ReaderMixin,
   }
 
   Future<std::shared_ptr<RecordBatch>> ReadNextAsync() override {
+    util::tracing::Span span;
+    START_SPAN(span, "arrow::csv::ReadNextAsync");
     return record_batch_gen_();
   }
 
@@ -892,6 +896,11 @@ class StreamingReaderImpl : public ReaderMixin,
       return Status::Invalid("Empty CSV file");
     }
 
+    // Create an arrow::csv::ReadNextAsync span so that grouping by that name does not
+    // ignore the work performed for this first block.
+    util::tracing::Span read_span;
+    auto scope = START_SCOPED_SPAN(read_span, "arrow::csv::ReadNextAsync");
+
     std::shared_ptr<Buffer> after_header;
     ARROW_ASSIGN_OR_RAISE(auto header_bytes_consumed,
                           ProcessHeader(first_buffer, &after_header));
@@ -911,9 +920,12 @@ class StreamingReaderImpl : public ReaderMixin,
     auto rb_gen = MakeMappedGenerator(std::move(parsed_block_gen), std::move(decoder_op));
 
     auto self = shared_from_this();
-    return rb_gen().Then([self, rb_gen, max_readahead](const DecodedBlock& first_block) {
+    auto init_finished = rb_gen().Then([self, rb_gen, max_readahead,
+                                        read_span = std::move(read_span)
+    ](const DecodedBlock& first_block) {
       return self->InitFromBlock(first_block, std::move(rb_gen), max_readahead, 0);
     });
+    return init_finished;
   }
 
   Future<> InitFromBlock(const DecodedBlock& block,

diff --git a/cpp/src/arrow/dataset/dataset_internal.h b/cpp/src/arrow/dataset/dataset_internal.h
@@ -29,8 +29,10 @@
 #include "arrow/scalar.h"
 #include "arrow/type.h"
 #include "arrow/util/async_generator.h"
+#include "arrow/util/byte_size.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/iterator.h"
+#include "arrow/util/tracing_internal.h"
 
 namespace arrow {
 namespace dataset {
@@ -144,6 +146,12 @@ inline RecordBatchGenerator MakeChunkedBatchGenerator(RecordBatchGenerator gen,
       [batch_size](const std::shared_ptr<RecordBatch>& batch)
           -> ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>> {
         const int64_t rows = batch->num_rows();
+        util::tracing::Span span;
+        START_SPAN(span, "MakeChunkedBatchGenerator",
+                   {{"target_batch_size_rows", batch_size},
+                    {"batch.size_rows", rows},
+                    {"batch.size_bytes", util::TotalBufferSize(*batch)},
+                    {"output_batches", rows / batch_size + (rows % batch_size != 0)}});
         if (rows <= batch_size) {
           return ::arrow::MakeVectorGenerator<std::shared_ptr<RecordBatch>>({batch});
         }

diff --git a/cpp/src/arrow/dataset/dataset_writer.cc b/cpp/src/arrow/dataset/dataset_writer.cc
@@ -27,6 +27,7 @@
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
 #include "arrow/table.h"
+#include "arrow/util/byte_size.h"
 #include "arrow/util/future.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/map.h"
@@ -191,9 +192,13 @@ class DatasetWriterFileQueue {
   }
 
   Result<int64_t> PopAndDeliverStagedBatch() {
+    util::tracing::Span span;
+    START_SPAN(span, "DatasetWriter::Pop");
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<RecordBatch> next_batch, PopStagedBatch());
     int64_t rows_popped = next_batch->num_rows();
     rows_currently_staged_ -= next_batch->num_rows();
+    ATTRIBUTE_ON_CURRENT_SPAN("batch.size_rows", next_batch->num_rows());
+    ATTRIBUTE_ON_CURRENT_SPAN("rows_currently_staged", rows_currently_staged_);
     ScheduleBatch(std::move(next_batch));
     return rows_popped;
   }
@@ -202,7 +207,15 @@ class DatasetWriterFileQueue {
   Status Push(std::shared_ptr<RecordBatch> batch) {
     uint64_t delta_staged = batch->num_rows();
     rows_currently_staged_ += delta_staged;
-    staged_batches_.push_back(std::move(batch));
+    {
+      util::tracing::Span span;
+      START_SPAN(span, "DatasetWriter::Push",
+                 {{"batch.size_rows", batch->num_rows()},
+                  {"rows_currently_staged", rows_currently_staged_},
+                  {"options_.min_rows_per_group", options_.min_rows_per_group},
+                  {"max_rows_staged", writer_state_->max_rows_staged}});
+      staged_batches_.push_back(std::move(batch));
+    }
     while (!staged_batches_.empty() &&
            (writer_state_->StagingFull() ||
             rows_currently_staged_ >= options_.min_rows_per_group)) {
@@ -233,6 +246,20 @@ class DatasetWriterFileQueue {
     return DeferNotOk(options_.filesystem->io_context().executor()->Submit(
         [self = this, batch = std::move(next)]() {
           int64_t rows_to_release = batch->num_rows();
+#ifdef ARROW_WITH_OPENTELEMETRY
+          // Batch statistics are gathered only for the tracing span below.
+          uint64_t size_bytes = util::TotalBufferSize(*batch);
+          uint64_t num_buffers = 0;
+          // Bind by const reference: copying each column's shared_ptr is unnecessary.
+          for (const auto& column : batch->columns()) {
+            num_buffers += column->data()->buffers.size();
+          }
+          util::tracing::Span span;
+          START_SPAN(span, "DatasetWriter::WriteNext",
+                     {{"threadpool", "IO"},
+                      {"batch.size_bytes", size_bytes},
+                      {"batch.num_buffers", num_buffers}});
+#endif
           Status status = self->writer_->Write(batch);
           self->writer_state_->rows_in_flight_throttle.Release(rows_to_release);
           return status;
@@ -261,11 +286,6 @@ class DatasetWriterFileQueue {
   util::AsyncTaskScheduler* file_tasks_ = nullptr;
 };
 
-struct WriteTask {
-  std::string filename;
-  uint64_t num_rows;
-};
-
 class DatasetWriterDirectoryQueue {
  public:
   DatasetWriterDirectoryQueue(util::AsyncTaskScheduler* scheduler, std::string directory,
@@ -301,7 +321,6 @@ class DatasetWriterDirectoryQueue {
 
   Status StartWrite(const std::shared_ptr<RecordBatch>& batch) {
     rows_written_ += batch->num_rows();
-    WriteTask task{current_filename_, static_cast<uint64_t>(batch->num_rows())};
     if (!latest_open_file_) {
       ARROW_RETURN_NOT_OK(OpenFileQueue(current_filename_));
     }
@@ -351,6 +370,8 @@ class DatasetWriterDirectoryQueue {
     latest_open_file_tasks_ = util::MakeThrottledAsyncTaskGroup(
         scheduler_, 1, /*queue=*/nullptr, std::move(file_finish_task));
     if (init_future_.is_valid()) {
+      util::tracing::Span span;
+      START_SPAN(span, "arrow::dataset::WaitForDirectoryInit");
       latest_open_file_tasks_->AddSimpleTask(
           [init_future = init_future_]() { return init_future; },
           "DatasetWriter::WaitForDirectoryInit"sv);
@@ -362,6 +383,8 @@ class DatasetWriterDirectoryQueue {
   uint64_t rows_written() const { return rows_written_; }
 
   void PrepareDirectory() {
+    util::tracing::Span span;
+    START_SPAN(span, "arrow::dataset::SubmitPrepareDirectoryTask");
     if (directory_.empty() || !write_options_.create_dir) {
       return;
     }
@@ -383,6 +406,8 @@ class DatasetWriterDirectoryQueue {
     if (write_options_.existing_data_behavior ==
         ExistingDataBehavior::kDeleteMatchingPartitions) {
       init_task = [this, create_dir_cb, notify_waiters_cb, notify_waiters_on_err_cb] {
+        util::tracing::Span span;
+        START_SPAN(span, "arrow::dataset::PrepareDirectory");
         return write_options_.filesystem
             ->DeleteDirContentsAsync(directory_,
                                      /*missing_dir_ok=*/true)
@@ -614,12 +639,14 @@ class DatasetWriter::DatasetWriterImpl {
       backpressure =
           writer_state_.rows_in_flight_throttle.Acquire(next_chunk->num_rows());
       if (!backpressure.is_finished()) {
+        EVENT(scheduler_->span(), "DatasetWriter::Backpressure::TooManyRowsQueued");
         EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyRowsQueued");
         break;
       }
       if (will_open_file) {
         backpressure = writer_state_.open_files_throttle.Acquire(1);
         if (!backpressure.is_finished()) {
+          EVENT(scheduler_->span(), "DatasetWriter::Backpressure::TooManyOpenFiles");
           EVENT_ON_CURRENT_SPAN("DatasetWriter::Backpressure::TooManyOpenFiles");
           RETURN_NOT_OK(TryCloseLargestFile());
           break;

diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
@@ -278,10 +278,6 @@ static inline Result<csv::ReadOptions> GetReadOptions(
 static inline Future<std::shared_ptr<csv::StreamingReader>> OpenReaderAsync(
     const FileSource& source, const CsvFileFormat& format,
     const std::shared_ptr<ScanOptions>& scan_options, Executor* cpu_executor) {
-#ifdef ARROW_WITH_OPENTELEMETRY
-  auto tracer = arrow::internal::tracing::GetTracer();
-  auto span = tracer->StartSpan("arrow::dataset::CsvFileFormat::OpenReaderAsync");
-#endif
   ARROW_ASSIGN_OR_RAISE(
       auto fragment_scan_options,
       GetFragmentScanOptions<CsvFragmentScanOptions>(
@@ -300,31 +296,24 @@ static inline Future<std::shared_ptr<csv::StreamingReader>> OpenReaderAsync(
   // input->Peek call blocks so we run the whole thing on the I/O thread pool.
   auto reader_fut = DeferNotOk(input->io_context().executor()->Submit(
       [=]() -> Future<std::shared_ptr<csv::StreamingReader>> {
+
         ARROW_ASSIGN_OR_RAISE(auto first_block, input->Peek(reader_options.block_size));
         const auto& parse_options = format.parse_options;
         ARROW_ASSIGN_OR_RAISE(
             auto convert_options,
             GetConvertOptions(format, scan_options ? scan_options.get() : nullptr,
                               first_block));
-        return csv::StreamingReader::MakeAsync(io::default_io_context(), std::move(input),
-                                               cpu_executor, reader_options,
-                                               parse_options, convert_options);
+        return csv::StreamingReader::MakeAsync(
+            io::default_io_context(), std::move(input), cpu_executor, reader_options,
+            parse_options, convert_options);
       }));
   return reader_fut.Then(
       // Adds the filename to the error
       [=](const std::shared_ptr<csv::StreamingReader>& reader)
           -> Result<std::shared_ptr<csv::StreamingReader>> {
-#ifdef ARROW_WITH_OPENTELEMETRY
-        span->SetStatus(opentelemetry::trace::StatusCode::kOk);
-        span->End();
-#endif
         return reader;
       },
       [=](const Status& err) -> Result<std::shared_ptr<csv::StreamingReader>> {
-#ifdef ARROW_WITH_OPENTELEMETRY
-        arrow::internal::tracing::MarkSpan(err, span.get());
-        span->End();
-#endif
         return err.WithMessage("Could not open CSV input source '", path, "': ", err);
       });
 }
@@ -384,8 +373,6 @@ Result<RecordBatchGenerator> CsvFileFormat::ScanBatchesAsync(
   auto reader_fut =
       OpenReaderAsync(source, *this, scan_options, ::arrow::internal::GetCpuThreadPool());
   auto generator = GeneratorFromReader(std::move(reader_fut), scan_options->batch_size);
-  WRAP_ASYNC_GENERATOR_WITH_CHILD_SPAN(
-      generator, "arrow::dataset::CsvFileFormat::ScanBatchesAsync::Next");
   return generator;
 }
 

diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
@@ -57,6 +57,7 @@
 #include "arrow/util/parallel.h"
 #include "arrow/util/string.h"
 #include "arrow/util/thread_pool.h"
+#include "arrow/util/tracing_internal.h"
 #include "arrow/util/ubsan.h"
 #include "arrow/util/vector.h"
 #include "arrow/visit_type_inline.h"
@@ -523,6 +524,25 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op
 
   return ::arrow::internal::OptionalParallelFor(
       options.use_threads, static_cast<int>(buffers.size()), [&](int i) {
+#ifdef ARROW_WITH_OPENTELEMETRY
+        // The attribute is derived from the buffer as it is handed to
+        // DecompressBuffer, minus one int64_t. Do the arithmetic in int64_t and clamp
+        // at zero: mixing the signed size() with the unsigned sizeof() would report a
+        // huge value for any buffer shorter than sizeof(int64_t).
+        // NOTE(review): the null check assumes `buffers` may hold unset entries, and
+        // the value is measured before decompression -- confirm the attribute name.
+        const auto& input_buffer = *buffers[i];
+        const int64_t input_size = input_buffer ? input_buffer->size() : 0;
+        const int64_t prefix_size = static_cast<int64_t>(sizeof(int64_t));
+        const int64_t payload_size =
+            input_size > prefix_size ? input_size - prefix_size : 0;
+        util::tracing::Span span;
+        START_SPAN(span, "arrow::ipc::DecompressBuffer",
+                   {{"buffer_index", i},
+                    {"ipc.compression.codec", codec.get()->name().c_str()},
+                    {"ipc.options.use_threads", options.use_threads},
+                    {"size.uncompressed", payload_size}});
+#endif
         ARROW_ASSIGN_OR_RAISE(*buffers[i],
                               DecompressBuffer(*buffers[i], options, codec.get()));
         return Status::OK();