Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/src/parquet/arrow/arrow-reader-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "parquet/arrow/schema.h"
#include "parquet/arrow/test-util.h"
#include "parquet/arrow/writer.h"
#include "parquet/column_writer.h"
#include "parquet/file_writer.h"
#include "parquet/test-util.h"

Expand Down Expand Up @@ -459,7 +460,7 @@ static std::shared_ptr<GroupNode> MakeSimpleSchema(const DataType& type,
case ::arrow::Type::DECIMAL: {
const auto& decimal_type =
static_cast<const ::arrow::Decimal128Type&>(values_type);
byte_width = DecimalSize(decimal_type.precision());
byte_width = internal::DecimalSize(decimal_type.precision());
} break;
default:
break;
Expand All @@ -470,7 +471,7 @@ static std::shared_ptr<GroupNode> MakeSimpleSchema(const DataType& type,
break;
case ::arrow::Type::DECIMAL: {
const auto& decimal_type = static_cast<const ::arrow::Decimal128Type&>(type);
byte_width = DecimalSize(decimal_type.precision());
byte_width = internal::DecimalSize(decimal_type.precision());
} break;
default:
break;
Expand Down
10 changes: 0 additions & 10 deletions cpp/src/parquet/arrow/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <algorithm>
#include <cstring>
#include <functional>
#include <future>
#include <numeric>
#include <utility>
Expand All @@ -33,13 +32,11 @@
#include "arrow/util/thread-pool.h"

#include "parquet/arrow/reader_internal.h"
#include "parquet/arrow/schema.h"
#include "parquet/column_reader.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"
#include "parquet/schema-internal.h"
#include "parquet/schema.h"

using arrow::Array;
Expand Down Expand Up @@ -76,8 +73,6 @@ using parquet::internal::RecordReader;
namespace parquet {
namespace arrow {

class ColumnChunkReaderImpl;

class ColumnReaderImpl : public ColumnReader {
public:
enum ReaderType { PRIMITIVE, LIST, STRUCT };
Expand All @@ -91,11 +86,6 @@ class ColumnReaderImpl : public ColumnReader {
virtual ReaderType type() const = 0;
};

// Return a copy of the process-wide default ArrowReaderProperties.
ArrowReaderProperties default_arrow_reader_properties() {
  // Function-local static: constructed exactly once, on first call
  // (thread-safe initialization since C++11). Callers get a copy.
  static const ArrowReaderProperties kDefaultProperties;
  return kDefaultProperties;
}

// ----------------------------------------------------------------------
// FileReaderImpl forward declaration

Expand Down
50 changes: 1 addition & 49 deletions cpp/src/parquet/arrow/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@

#include <cstdint>
#include <memory>
#include <unordered_set>
#include <vector>

#include "parquet/file_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

Expand All @@ -39,8 +39,6 @@ class Table;
namespace parquet {

class FileMetaData;
class ParquetFileReader;
class ReaderProperties;
class SchemaDescriptor;

namespace arrow {
Expand All @@ -49,52 +47,6 @@ class ColumnChunkReader;
class ColumnReader;
class RowGroupReader;

// By default, read serially on the calling thread (no multi-threaded decode).
static constexpr bool DEFAULT_USE_THREADS = false;

// Default number of rows to read when using ::arrow::RecordBatchReader
static constexpr int64_t DEFAULT_BATCH_SIZE = 64 * 1024;

/// EXPERIMENTAL: Properties for configuring FileReader behavior.
class PARQUET_EXPORT ArrowReaderProperties {
 public:
  /// \brief Construct reader properties.
  /// \param use_threads whether reads may be parallelized (default:
  ///   DEFAULT_USE_THREADS, i.e. single-threaded)
  explicit ArrowReaderProperties(bool use_threads = DEFAULT_USE_THREADS)
      : use_threads_(use_threads),
        read_dict_indices_(),
        batch_size_(DEFAULT_BATCH_SIZE) {}

  /// Enable or disable multi-threaded reads.
  void set_use_threads(bool use_threads) { use_threads_ = use_threads; }

  /// Return whether multi-threaded reads are enabled.
  bool use_threads() const { return use_threads_; }

  /// Flag (or unflag, when read_dict is false) the column at column_index
  /// for dictionary-decoded reads.
  void set_read_dictionary(int column_index, bool read_dict) {
    if (read_dict) {
      read_dict_indices_.insert(column_index);
    } else {
      read_dict_indices_.erase(column_index);
    }
  }

  /// Return true if the column at column_index was flagged for
  /// dictionary-decoded reads.
  bool read_dictionary(int column_index) const {
    // Direct membership test; the previous if/else returning true/false
    // was a redundant spelling of the same boolean.
    return read_dict_indices_.find(column_index) != read_dict_indices_.end();
  }

  /// Set the maximum number of rows per ::arrow::RecordBatch.
  void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }

  /// Return the maximum number of rows per ::arrow::RecordBatch.
  int64_t batch_size() const { return batch_size_; }

 private:
  bool use_threads_;
  std::unordered_set<int> read_dict_indices_;
  int64_t batch_size_;
};

/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
PARQUET_EXPORT
ArrowReaderProperties default_arrow_reader_properties();

// Arrow read adapter class for deserializing Parquet files as Arrow row
// batches.
//
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/arrow/reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,17 @@
#include <memory>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include <boost/algorithm/string/predicate.hpp>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/compute/kernel.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/int-util.h"
#include "arrow/util/logging.h"
Expand All @@ -42,6 +43,7 @@
#include "parquet/arrow/reader.h"
#include "parquet/column_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
#include "parquet/types.h"

Expand Down
17 changes: 5 additions & 12 deletions cpp/src/parquet/arrow/reader_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,19 @@

#pragma once

#include <cstdint>
#include <deque>
#include <functional>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "arrow/status.h"

#include "parquet/column_reader.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace arrow {
Expand All @@ -36,26 +38,17 @@ class Array;
class ChunkedArray;
class DataType;
class Field;
class MemoryPool;
class Schema;

} // namespace arrow

using arrow::Status;

namespace parquet {

class ColumnDescriptor;

namespace internal {

class RecordReader;

} // namespace internal
class ArrowReaderProperties;

namespace arrow {

class ArrowReaderProperties;
class ColumnReaderImpl;

// ----------------------------------------------------------------------
Expand Down
79 changes: 1 addition & 78 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,13 @@
#include "parquet/arrow/schema.h"

#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

#include "parquet/arrow/writer.h"
#include "parquet/exception.h"
#include "parquet/properties.h"
#include "parquet/schema-internal.h"
#include "parquet/types.h"

using arrow::Field;
Expand Down Expand Up @@ -262,7 +257,7 @@ Status FieldToNode(const std::shared_ptr<Field>& field,
static_cast<const ::arrow::Decimal128Type&>(*field->type());
precision = decimal_type.precision();
scale = decimal_type.scale();
length = DecimalSize(precision);
length = internal::DecimalSize(precision);
PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
} break;
case ArrowTypeId::DATE32:
Expand Down Expand Up @@ -351,77 +346,5 @@ Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
out);
}

/// \brief Compute the number of bytes required to represent a decimal of a
/// given precision. Taken from the Apache Impala codebase. Entry i of the
/// lookup table below is the smallest byte count whose 2's-complement range
/// can hold every value of precision i + 1.
int32_t DecimalSize(int32_t precision) {
  DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
                          << precision;
  DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got "
                           << precision;

  // kBytesForPrecision[p - 1] is the minimum byte width that holds any
  // p-digit decimal; e.g. 9 digits fit in 4 bytes (max 2,147,483,647),
  // 18 digits in 8 bytes (max 9,223,372,036,854,775,807), 38 digits in 16.
  static constexpr int32_t kBytesForPrecision[38] = {
      1,  1,  2,  2,  3,  3,  4,  4,  4,  5,  5,  6,  6,
      6,  7,  7,  8,  8,  9,  9,  9,  10, 10, 11, 11, 11,
      12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16};

  if (precision >= 1 && precision <= 38) {
    return kBytesForPrecision[precision - 1];
  }
  // Out of range: unreachable in debug builds (the DCHECKs above fire);
  // mirrors the sentinel the caller would have seen before.
  DCHECK(false);
  return -1;
}

} // namespace arrow
} // namespace parquet
10 changes: 1 addition & 9 deletions cpp/src/parquet/arrow/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,8 @@
#ifndef PARQUET_ARROW_SCHEMA_H
#define PARQUET_ARROW_SCHEMA_H

#include <cstdint>
#include <memory>
#include <vector>

#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

Expand All @@ -35,12 +32,11 @@ class Schema;

namespace parquet {

class ArrowWriterProperties;
class WriterProperties;

namespace arrow {

class ArrowWriterProperties;

PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
const WriterProperties& properties,
Expand All @@ -58,11 +54,7 @@ ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
const WriterProperties& properties,
std::shared_ptr<SchemaDescriptor>* out);

PARQUET_EXPORT
int32_t DecimalSize(int32_t precision);

} // namespace arrow

} // namespace parquet

#endif // PARQUET_ARROW_SCHEMA_H
Loading