From 5f8d678ce8308f3447349296955935e61ac6a17f Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 13:36:04 -0400 Subject: [PATCH 1/7] Reimplement featherread using arrow.internal.io.feather.Reader. --- matlab/src/matlab/featherread.m | 87 +++++++++++++-------------------- 1 file changed, 34 insertions(+), 53 deletions(-) diff --git a/matlab/src/matlab/featherread.m b/matlab/src/matlab/featherread.m index 5f4952873f9..833cbb4e657 100644 --- a/matlab/src/matlab/featherread.m +++ b/matlab/src/matlab/featherread.m @@ -23,64 +23,45 @@ % specific language governing permissions and limitations % under the License. -import arrow.util.*; - -% Validate input arguments. -narginchk(1, 1); -filename = convertStringsToChars(filename); -if ~ischar(filename) - error('MATLAB:arrow:InvalidFilenameDatatype', ... - 'Filename must be a character vector or string scalar.'); -end + arguments + filename(1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} + end -% FOPEN can be used to search for files without an extension on the MATLAB -% path. -fid = fopen(filename); -if fid ~= -1 - filename = fopen(fid); - fclose(fid); -else - error('MATLAB:arrow:UnableToOpenFile', ... - 'Unable to open file %s.', filename); -end + typesToCast = [arrow.type.ID.UInt8, ... + arrow.type.ID.UInt16, ... + arrow.type.ID.UInt32, ... + arrow.type.ID.UInt64, ... + arrow.type.ID.Int8, ... + arrow.type.ID.Int16, ... + arrow.type.ID.Int32, ... + arrow.type.ID.Int64, ... + arrow.type.ID.Float32, ... + arrow.type.ID.Float64, ... + arrow.type.ID.Boolean]; -% Read table variables and metadata from the given Feather file using -% libarrow. -[variables, metadata] = arrow.cpp.call('featherread', filename); + reader = arrow.internal.io.feather.Reader(filename); + recordBatch = reader.read(); -% Make valid MATLAB table variable names out of any of the -% Feather table column names that are not valid MATLAB table -% variable names. -[variableNames, variableDescriptions] = makeValidMATLABTableVariableNames({variables.Name}); + % Convert RecordBatch to a MATLAB table. + t = table(recordBatch); -% Iterate over each table variable, handling invalid (null) entries -% and invalid MATLAB table variable names appropriately. -% Note: All Arrow arrays can have an associated validity (null) bitmap. -% The Apache Arrow specification defines 0 (false) to represent an -% invalid (null) array entry and 1 (true) to represent a valid -% (non-null) array entry. -for ii = 1:length(variables) - if ~all(variables(ii).Valid) - switch variables(ii).Type - case {'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64'} - % MATLAB does not support missing values for integer types, so - % cast to double and set missing values to NaN in this case. - variables(ii).Data = double(variables(ii).Data); + % Cast columns with integer or boolean type containing null values + % to MATLAB double type. Substitute null values with NaN. + for ii = 1:recordBatch.NumColumns + array = recordBatch.column(ii); + type = array.Type.ID; + if any(type == typesToCast) + % Cast to double. + t{:, ii} = double(t{:, ii}); + % Substitute null values with NaN. + t{~array.Valid, ii} = NaN; end - - % Set invalid (null) entries to the appropriate MATLAB missing value using - % logical indexing. - variables(ii).Data(~variables(ii).Valid) = missing; end -end -% Construct a MATLAB table from the Feather file data. -t = table(variables.Data, 'VariableNames', cellstr(variableNames)); - -% Store original Feather table column names in the table.Properties.VariableDescriptions -% property if they were modified to be valid MATLAB table variable names. -if ~isempty(variableDescriptions) - t.Properties.VariableDescriptions = cellstr(variableDescriptions); -end + % Store original Feather table column names in the table.Properties.VariableDescriptions + % property if they were modified to be valid MATLAB table variable names. + if ~all(t.Properties.VariableNames == recordBatch.ColumnNames) + t.Properties.VariableDescriptions = recordBatch.ColumnNames; + end -end +end \ No newline at end of file From 677890dbb3ed54a7ff60da50df6f5559ff9df9c1 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 15:05:24 -0400 Subject: [PATCH 2/7] Use CombineChunksToBatch to avoid nullptr dereference. --- matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc index a264d24ecb1..db33b410f7b 100644 --- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc @@ -72,10 +72,8 @@ namespace arrow::matlab::io::feather::proxy { std::shared_ptr table = nullptr; MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(reader->Read(&table), context, error::FEATHER_FAILED_TO_READ_TABLE); - // Get the first RecordBatch from the Table. - arrow::TableBatchReader table_batch_reader{table}; - std::shared_ptr record_batch = nullptr; - MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(table_batch_reader.ReadNext(&record_batch), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH); + // Combine all the chunks of the Table into a single RecordBatch. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto record_batch, table->CombineChunksToBatch(arrow::default_memory_pool()), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH); // Create a Proxy from the first RecordBatch. auto record_batch_proxy = std::make_shared(record_batch); From 8fb1d484213ddad0f6075d553aba14f5eea76405 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 15:05:53 -0400 Subject: [PATCH 3/7] Update error messages in Feather tests. --- matlab/test/tfeather.m | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/matlab/test/tfeather.m b/matlab/test/tfeather.m index e4c988e1dda..20d69368d71 100755 --- a/matlab/test/tfeather.m +++ b/matlab/test/tfeather.m @@ -143,7 +143,7 @@ function zeroByNTable(testCase) function ErrorIfUnableToOpenFile(testCase) filename = fullfile(pwd, 'temp.feather'); - testCase.verifyError(@() featherread(filename), 'MATLAB:arrow:UnableToOpenFile'); + testCase.verifyError(@() featherread(filename), 'arrow:io:FailedToOpenFileForRead'); end function ErrorIfCorruptedFeatherFile(testCase) @@ -156,16 +156,13 @@ function ErrorIfCorruptedFeatherFile(testCase) fwrite(fileID, [1; 5]); fclose(fileID); - testCase.verifyError(@() featherread(filename), 'MATLAB:arrow:status:Invalid'); + testCase.verifyError(@() featherread(filename), 'arrow:io:feather:FailedToCreateReader'); end function ErrorIfInvalidFilenameDatatype(testCase) - filename = fullfile(pwd, 'temp.feather'); - t = createTable; testCase.verifyError(@() featherwrite({table}, t), 'MATLAB:validation:UnableToConvert'); - testCase.verifyError(@() featherread({filename}), 'MATLAB:arrow:InvalidFilenameDatatype'); end function ErrorIfTooManyInputs(testCase) @@ -179,7 +176,7 @@ function ErrorIfTooManyInputs(testCase) function ErrorIfTooFewInputs(testCase) testCase.verifyError(@() featherwrite(), 'MATLAB:minrhs'); - testCase.verifyError(@() featherread(), 'MATLAB:narginchk:notEnoughInputs'); + testCase.verifyError(@() featherread(), 'MATLAB:minrhs'); end function ErrorIfMultiColVarExist(testCase) From 16653eca9cbc8cfce3c0772d9e9e73513be3c183 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 15:42:19 -0400 Subject: [PATCH 4/7] Fix failing invalid variable name Feather test. --- matlab/src/matlab/featherread.m | 6 ++++-- matlab/test/tfeathermex.m | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/matlab/src/matlab/featherread.m b/matlab/src/matlab/featherread.m index 833cbb4e657..3995ba05c71 100644 --- a/matlab/src/matlab/featherread.m +++ b/matlab/src/matlab/featherread.m @@ -60,8 +60,10 @@ % Store original Feather table column names in the table.Properties.VariableDescriptions % property if they were modified to be valid MATLAB table variable names. - if ~all(t.Properties.VariableNames == recordBatch.ColumnNames) - t.Properties.VariableDescriptions = recordBatch.ColumnNames; + modifiedColumnNameIndices = t.Properties.VariableNames ~= recordBatch.ColumnNames; + if any(modifiedColumnNameIndices) + originalColumnNames = recordBatch.ColumnNames(modifiedColumnNameIndices); + t.Properties.VariableDescriptions(modifiedColumnNameIndices) = compose("Original variable name: '%s'", originalColumnNames); end end \ No newline at end of file diff --git a/matlab/test/tfeathermex.m b/matlab/test/tfeathermex.m index c0620e9054b..8b6b87aab10 100644 --- a/matlab/test/tfeathermex.m +++ b/matlab/test/tfeathermex.m @@ -50,17 +50,17 @@ function InvalidMATLABTableVariableNames(testCase) filename = fullfile(pwd, 'temp.feather'); % Create a table with an invalid MATLAB table variable name. - invalidVariable = arrow.util.createVariableStruct('double', 1, true, '@'); + invalidVariable = arrow.util.createVariableStruct('double', 1, true, ':'); validVariable = arrow.util.createVariableStruct('double', 1, true, 'Valid'); variables = [invalidVariable, validVariable]; metadata = arrow.util.createMetadataStruct(1, 2); arrow.cpp.call('featherwrite', filename, variables, metadata); t = featherread(filename); - testCase.verifyEqual(t.Properties.VariableNames{1}, 'x_'); + testCase.verifyEqual(t.Properties.VariableNames{1}, ':_1'); testCase.verifyEqual(t.Properties.VariableNames{2}, 'Valid'); - testCase.verifyEqual(t.Properties.VariableDescriptions{1}, 'Original variable name: ''@'''); + testCase.verifyEqual(t.Properties.VariableDescriptions{1}, 'Original variable name: '':'''); testCase.verifyEqual(t.Properties.VariableDescriptions{2}, ''); end end From f2e6ae1dace17b8c93ee174a42b192f1179ad202 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 16:16:45 -0400 Subject: [PATCH 5/7] Use appropriate table indexing syntax when casting to double. --- matlab/src/matlab/featherread.m | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/matlab/src/matlab/featherread.m b/matlab/src/matlab/featherread.m index 3995ba05c71..f9390cb87a5 100644 --- a/matlab/src/matlab/featherread.m +++ b/matlab/src/matlab/featherread.m @@ -35,8 +35,6 @@ arrow.type.ID.Int16, ... arrow.type.ID.Int32, ... arrow.type.ID.Int64, ... - arrow.type.ID.Float32, ... - arrow.type.ID.Float64, ... arrow.type.ID.Boolean]; reader = arrow.internal.io.feather.Reader(filename); @@ -50,9 +48,9 @@ for ii = 1:recordBatch.NumColumns array = recordBatch.column(ii); type = array.Type.ID; - if any(type == typesToCast) + if any(type == typesToCast) && any(~array.Valid) % Cast to double. - t{:, ii} = double(t{:, ii}); + t.(ii) = double(t.(ii)); % Substitute null values with NaN. t{~array.Valid, ii} = NaN; end From 6b8ca40d247b282a14d113a9bc8852f539b10962 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 16:27:38 -0400 Subject: [PATCH 6/7] Add tests for round-tripping all supported MATLAB types and for round-tripping tables with Unicode variable names. --- matlab/test/tfeather.m | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/matlab/test/tfeather.m b/matlab/test/tfeather.m index 20d69368d71..8cdbef27fad 100755 --- a/matlab/test/tfeather.m +++ b/matlab/test/tfeather.m @@ -215,5 +215,41 @@ function NumericComplexUnsupported(testCase) testCase.verifyError(@() featherwrite(filename, actualTable) ,'arrow:array:ComplexNumeric'); end + + function SupportedTypes(testCase) + filename = fullfile(pwd, 'temp.feather'); + + % Create a table with all supported MATLAB types. + expectedTable = table(int8 ([1, 2, 3]'), ... + int16 ([1, 2, 3]'), ... + int32 ([1, 2, 3]'), ... + int64 ([1, 2, 3]'), ... + uint8 ([1, 2, 3]'), ... + uint16 ([1, 2, 3]'), ... + uint32 ([1, 2, 3]'), ... + uint64 ([1, 2, 3]'), ... + logical([1, 0, 1]'), ... + single ([1, 2, 3]'), ... + double ([1, 2, 3]'), ... + string (["A", "B", "C"]'), ... + datetime(2023, 6, 28) + days(0:2)'); + + actualTable = featherRoundTrip(filename, expectedTable); + testCase.verifyEqual(actualTable, expectedTable); + end + + function UnicodeVariableNames(testCase) + filename = fullfile(pwd, 'temp.feather'); + + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + columnNames = [smiley, tree, mango]; + expectedTable = table(1, 2, 3, VariableNames=columnNames); + + actualTable = featherRoundTrip(filename, expectedTable); + testCase.verifyEqual(actualTable, expectedTable); + end + end end From 8d2858029ca6d9ed50501d10cd6a16f087e7d270 Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Mon, 14 Aug 2023 16:47:31 -0400 Subject: [PATCH 7/7] Update comment based on code review feedback. --- matlab/src/matlab/featherread.m | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/matlab/src/matlab/featherread.m b/matlab/src/matlab/featherread.m index f9390cb87a5..736fe832888 100644 --- a/matlab/src/matlab/featherread.m +++ b/matlab/src/matlab/featherread.m @@ -43,8 +43,8 @@ % Convert RecordBatch to a MATLAB table. t = table(recordBatch); - % Cast columns with integer or boolean type containing null values - % to MATLAB double type. Substitute null values with NaN. + % Cast integer and boolean columns containing null values + % to double. Substitute null values with NaN. for ii = 1:recordBatch.NumColumns array = recordBatch.column(ii); type = array.Type.ID; @@ -64,4 +64,4 @@ t.Properties.VariableDescriptions(modifiedColumnNameIndices) = compose("Original variable name: '%s'", originalColumnNames); end -end \ No newline at end of file +end