Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
21fc450
Add some basic unittests that exercise the DecodeArrow methods
Feb 14, 2019
fa50415
Implement DecodeArrowNonNull and unit tests
Feb 14, 2019
4a26f74
remove todo message
Feb 14, 2019
31667ff
added tests for DictByteArrayDecoder and reworked previous tests
Feb 18, 2019
ef55081
added benchmarks for decoding plain encoded data using arrow builders
Feb 18, 2019
89de5d5
rework data generation so that decoding benchmark runs using a more r…
Feb 19, 2019
39a5f19
fix appveyor windows failure
Feb 21, 2019
4fbcf1f
fix loop increment in templated PlainByteArrayDecoder::DecodeArrow me…
Feb 21, 2019
f234ca2
respond to code review feedback - many readability fixes in benchmark…
Feb 22, 2019
84df23b
prefer range-based for loop to KeepRunning while loop pattern
Feb 22, 2019
78eddb8
Use value parameterization in decoding tests
Feb 26, 2019
ff38021
prefer mersenne twister prng over default one which is implementation …
hatemhelal Feb 27, 2019
b16eaa9
prefer default_random_engine to avoid potential slowdown with Mersenn…
Feb 27, 2019
8f59198
Add overloads for decoding using a StringDictionaryBuilder
hatemhelal Feb 28, 2019
28d76b7
Add benchmark for dictionary decoding using arrow builder
hatemhelal Feb 28, 2019
a8c1535
Add support for requesting a parquet column be read as a DictionaryArray
hatemhelal Mar 5, 2019
a6740f3
Make sure to update the schema when reading a column as a DictionaryA…
hatemhelal Mar 5, 2019
a357544
Basic unittests for reading DictionaryArray directly from parquet
hatemhelal Mar 5, 2019
077a8f1
Move function definition to (hopefully) resolve appveyor build failur…
hatemhelal Mar 6, 2019
7aac84c
Reworked encoding benchmark to reduce code duplication
hatemhelal Mar 9, 2019
a267a27
remove unnecessary inlines
Mar 11, 2019
babe52e
replace deprecated ReadableFileInterface with RandomAccessFile
Mar 11, 2019
6e65fdb
simplify ArrowReaderProperties and mark as experimental
Mar 11, 2019
4d7bb30
Refactor dictionary data generation into RandomArrayGenerator
hatemhelal Mar 12, 2019
5fb9e86
Rework parquet encoding tests
Mar 12, 2019
7347cfa
Fix DecodeArrow from plain encoded columns
Mar 12, 2019
9da1331
Temporarily disable tests for arrow builder decoding from dictionary …
Mar 12, 2019
e6ca0db
Fix DictEncoding test: need to use PutSpaced instead of Put in setup
hatemhelal Mar 13, 2019
7719b94
Use random string generator instead of poor JSON
hatemhelal Mar 13, 2019
2c8fa7e
revert incorrect changes to PlainByteArrayDecoder::DecodeArrow method
Mar 13, 2019
5bc933b
use PutSpaced in test setup to correctly initialize encoded data
Mar 13, 2019
2026b51
Rework ByteArrayDecoder interface to reduce code duplication
Mar 13, 2019
99e9dee
Removed dependencies on arrow builder in parquet/encoding
Mar 13, 2019
023c022
Add virtual destructor to WrappedBuilderInterface
Mar 13, 2019
f644fff
Move schema fix logic to post-processing step
hatemhelal Mar 14, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 61 additions & 2 deletions cpp/src/arrow/testing/random.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ std::shared_ptr<Array> RandomArrayGenerator::Boolean(int64_t size, double probab
// only calls the GenerateBitmap method.
using GenOpt = GenerateOptions<int, std::uniform_int_distribution<int>>;

std::vector<std::shared_ptr<Buffer>> buffers{2};
BufferVector buffers{2};
// Need 2 distinct generators such that probabilities are not shared.
GenOpt value_gen(seed(), 0, 1, probability);
GenOpt null_gen(seed(), 0, 1, null_probability);
Expand All @@ -100,7 +100,7 @@ static std::shared_ptr<NumericArray<ArrowType>> GenerateNumericArray(int64_t siz
OptionType options) {
using CType = typename ArrowType::c_type;
auto type = TypeTraits<ArrowType>::type_singleton();
std::vector<std::shared_ptr<Buffer>> buffers{2};
BufferVector buffers{2};

int64_t null_count = 0;
ABORT_NOT_OK(AllocateEmptyBitmap(size, &buffers[0]));
Expand Down Expand Up @@ -145,5 +145,64 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType)
#undef PRIMITIVE_RAND_FLOAT_IMPL
#undef PRIMITIVE_RAND_IMPL

// Generates a random StringArray with `size` rows.
//
// Row lengths, and which rows are null, are taken from
// Int32(size, min_length, max_length, null_probability). Every valid row is then
// filled with bytes from a generator configured with the bounds 'A' and 'z'.
//
// Aborts through ABORT_NOT_OK when null_probability lies outside [0, 1], when
// min_length is negative, or when min_length is greater than max_length.
std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size,
                                                           int32_t min_length,
                                                           int32_t max_length,
                                                           double null_probability) {
  if (null_probability < 0 || null_probability > 1) {
    ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1"));
  }
  // max_length sizes the scratch buffer below and every generated length is used
  // as a byte count, so a negative or inverted range must be rejected up front.
  if (min_length < 0 || min_length > max_length) {
    ABORT_NOT_OK(
        Status::Invalid("string lengths must satisfy 0 <= min_length <= max_length"));
  }

  auto int32_lengths = Int32(size, min_length, max_length, null_probability);
  auto lengths = std::dynamic_pointer_cast<Int32Array>(int32_lengths);

  // Visual Studio does not implement uniform_int_distribution for char types.
  using GenOpt = GenerateOptions<uint8_t, std::uniform_int_distribution<uint16_t>>;
  GenOpt options(seed(), static_cast<uint8_t>('A'), static_cast<uint8_t>('z'),
                 /*null_probability=*/0);

  // Scratch space reused for every row; max_length is the largest length requested
  // from the length generator above.
  std::vector<uint8_t> str_buffer(max_length);
  StringBuilder builder;

  for (int64_t i = 0; i < size; ++i) {
    if (lengths->IsValid(i)) {
      // Read the length once so generation and append agree on the byte count.
      const int32_t length = lengths->Value(i);
      options.GenerateData(str_buffer.data(), length);
      ABORT_NOT_OK(builder.Append(str_buffer.data(), length));
    } else {
      ABORT_NOT_OK(builder.AppendNull());
    }
  }

  std::shared_ptr<arrow::Array> result;
  ABORT_NOT_OK(builder.Finish(&result));
  return result;
}

// Builds a StringArray of `size` rows whose valid entries are sampled from a pool
// of `unique` randomly generated strings.
//
// The pool is produced by String() with a null probability of 0; null rows in the
// result come from the index array, which is generated with `null_probability`.
// NOTE(review): indices are requested in [0, unique - 1] and used to index the
// pool without a bounds check, so callers presumably must pass unique >= 1 when
// size > 0 -- confirm.
std::shared_ptr<arrow::Array> RandomArrayGenerator::StringWithRepeats(
    int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
    double null_probability) {
  // Pool of candidate strings; generated before the indices, as in the original
  // statement order.
  const auto pool = std::dynamic_pointer_cast<StringArray>(
      String(unique, min_length, max_length, /*null_probability=*/0));

  // One pool position per output row; invalid slots become null rows.
  const auto picks = std::dynamic_pointer_cast<Int64Array>(
      Int64(size, 0, unique - 1, null_probability));

  StringBuilder builder;
  for (int64_t row = 0; row < size; ++row) {
    if (!picks->IsValid(row)) {
      ABORT_NOT_OK(builder.AppendNull());
      continue;
    }
    ABORT_NOT_OK(builder.Append(pool->GetView(picks->Value(row))));
  }

  std::shared_ptr<Array> out;
  ABORT_NOT_OK(builder.Finish(&out));
  return out;
}
} // namespace random
} // namespace arrow
29 changes: 29 additions & 0 deletions cpp/src/arrow/testing/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,35 @@ class ARROW_EXPORT RandomArrayGenerator {
}
}

/// \brief Generates a random StringArray
///
/// The length of every row (and whether the row is null) is generated first;
/// each non-null row is then filled with randomly generated bytes taken from a
/// generator configured with the bounds 'A' and 'z'.
///
/// \param[in] size the size of the array to generate
/// \param[in] min_length the lower bound of the string length
///            determined by the uniform distribution
/// \param[in] max_length the upper bound of the string length
///            determined by the uniform distribution
/// \param[in] null_probability the probability of a row being null; the
///            implementation aborts when it lies outside [0, 1]
///
/// \return a generated Array
std::shared_ptr<arrow::Array> String(int64_t size, int32_t min_length,
int32_t max_length, double null_probability);

/// \brief Generates a random StringArray with repeated values
///
/// A pool of `unique` strings is generated without nulls, then every row of
/// the result is either null or a value sampled from that pool through a
/// randomly generated index.
///
/// \param[in] size the size of the array to generate
/// \param[in] unique the number of unique string values used
///            to populate the array (indices are requested in the range
///            0 to unique - 1; presumably must be at least 1 -- confirm)
/// \param[in] min_length the lower bound of the string length
///            determined by the uniform distribution
/// \param[in] max_length the upper bound of the string length
///            determined by the uniform distribution
/// \param[in] null_probability the probability of a row being null
///
/// \return a generated Array
std::shared_ptr<arrow::Array> StringWithRepeats(int64_t size, int64_t unique,
int32_t min_length, int32_t max_length,
double null_probability);

private:
SeedType seed() { return seed_distribution_(seed_rng_); }

Expand Down
75 changes: 75 additions & 0 deletions cpp/src/parquet/arrow/arrow-reader-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <vector>

#include "arrow/api.h"
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
Expand Down Expand Up @@ -2439,6 +2440,80 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {
ASSERT_RAISES(Invalid, writer->WriteTable(*tbl, 1));
}

// ----------------------------------------------------------------------
// Tests for directly reading DictionaryArray
// Value-parameterized fixture; the double test parameter is used as the null
// probability of the generated string column.
//
// SetUp() generates a dense string table plus the dictionary table built from the
// same values, and serializes the dense table into buffer_. Each test then
// adjusts properties_ and reads the buffer back through CheckReadWholeFile().
class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
public:
// Generates the expected tables for the current parameter, writes the dense one
// into buffer_ and resets properties_ to the default reader properties.
void SetUp() override {
GenerateData(GetParam());
// The second argument is half the row count -- presumably the row-group size,
// i.e. the data is written as two row groups (TODO confirm against
// WriteTableToBuffer).
ASSERT_NO_FATAL_FAILURE(
WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 2,
default_arrow_writer_properties(), &buffer_));

properties_ = default_arrow_reader_properties();
}

// Fills expected_dense_ with a single random string column of
// repeat * num_unique rows sampled from num_unique values, and expected_dict_
// with the result of appending that same column to a StringDictionaryBuilder.
void GenerateData(double null_probability) {
constexpr int num_unique = 100;
constexpr int repeat = 10;
constexpr int64_t min_length = 2;
constexpr int64_t max_length = 10;
// Constructed with a constant argument (presumably the seed, which would make
// the generated data reproducible -- confirm).
::arrow::random::RandomArrayGenerator rag(0);
auto dense_array = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length,
max_length, null_probability);
expected_dense_ = MakeSimpleTable(dense_array, /*nullable=*/true);

// Build the dictionary expectation from exactly the same values as the dense one.
::arrow::StringDictionaryBuilder builder(default_memory_pool());
const auto& string_array = static_cast<const ::arrow::StringArray&>(*dense_array);
ASSERT_OK(builder.AppendArray(string_array));

std::shared_ptr<::arrow::Array> dict_array;
ASSERT_OK(builder.Finish(&dict_array));
expected_dict_ = MakeSimpleTable(dict_array, /*nullable=*/true);

// TODO(hatemhelal): Figure out if we can use the following to init the expected_dict_
// Currently fails due to DataType mismatch for indices array.
// Datum out;
// FunctionContext ctx(default_memory_pool());
// ASSERT_OK(DictionaryEncode(&ctx, Datum(dense_array), &out));
// expected_dict_ = MakeSimpleTable(out.make_array(), /*nullable=*/true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Swapping in the cast implementation for setting up the expected_dict_ table results in an a test assertion failure in AssertTablesEqual. I used a debugger to work out that the trouble is the dictionary generated by the cast has indices with type arrow::Int32Type while the builder generated one has indices with type arrow::Int8Type.

I don't know if this is actually an artifact of the trivial test cases that I used?

}

// Nothing to release explicitly; kept as an explicit no-op override.
void TearDown() override {}

// Opens a FileReader over buffer_ using the current properties_, reads the whole
// table and asserts that it equals `expected`. The comparison is made with
// same_chunk_layout=false.
void CheckReadWholeFile(const Table& expected) {
std::unique_ptr<FileReader> reader;
ASSERT_OK_NO_THROW(OpenFile(std::make_shared<BufferReader>(buffer_),
::arrow::default_memory_pool(), properties_, &reader));

std::shared_ptr<Table> actual;
ASSERT_OK_NO_THROW(reader->ReadTable(&actual));
::arrow::AssertTablesEqual(*actual, expected, /*same_chunk_layout=*/false);
}

// Parameter values for the suite: no nulls, half nulls, all nulls.
// NOTE: the name is misspelled ("probabilites") but it is referenced by the suite
// instantiation, so it cannot be renamed here in isolation.
static std::vector<double> null_probabilites() { return {0.0, 0.5, 1}; }

protected:
// Table holding the generated column as a plain string array.
std::shared_ptr<Table> expected_dense_;
// Table holding the same values as produced by StringDictionaryBuilder.
std::shared_ptr<Table> expected_dict_;
// Output of WriteTableToBuffer for expected_dense_; the source for every read.
std::shared_ptr<Buffer> buffer_;
// Reader properties; reset in SetUp() and adjusted by each test before reading.
ArrowReaderProperties properties_;
};

// With dictionary reading switched on for index 0, the table read back must equal
// the dictionary expectation built by the fixture.
TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
  const int column = 0;
  const bool as_dictionary = true;
  properties_.set_read_dictionary(column, as_dictionary);

  const auto& expected = *expected_dict_;
  CheckReadWholeFile(expected);
}

// With dictionary reading switched off for index 0, the table read back must equal
// the dense expectation built by the fixture.
TEST_P(TestArrowReadDictionary, ReadWholeFileDense) {
  const int column = 0;
  const bool as_dictionary = false;
  properties_.set_read_dictionary(column, as_dictionary);

  const auto& expected = *expected_dense_;
  CheckReadWholeFile(expected);
}

// Instantiates every TestArrowReadDictionary test once per value returned by
// TestArrowReadDictionary::null_probabilites(), under the prefix "ReadDictionary".
INSTANTIATE_TEST_CASE_P(
ReadDictionary, TestArrowReadDictionary,
::testing::ValuesIn(TestArrowReadDictionary::null_probabilites()));

} // namespace arrow

} // namespace parquet
Loading