diff --git a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc index a264d24ecb1..db33b410f7b 100644 --- a/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc +++ b/matlab/src/cpp/arrow/matlab/io/feather/proxy/reader.cc @@ -72,10 +72,8 @@ namespace arrow::matlab::io::feather::proxy { std::shared_ptr table = nullptr; MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(reader->Read(&table), context, error::FEATHER_FAILED_TO_READ_TABLE); - // Get the first RecordBatch from the Table. - arrow::TableBatchReader table_batch_reader{table}; - std::shared_ptr record_batch = nullptr; - MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(table_batch_reader.ReadNext(&record_batch), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH); + // Combine all the chunks of the Table into a single RecordBatch. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto record_batch, table->CombineChunksToBatch(arrow::default_memory_pool()), context, error::FEATHER_FAILED_TO_READ_RECORD_BATCH); // Create a Proxy from the first RecordBatch. auto record_batch_proxy = std::make_shared(record_batch); diff --git a/matlab/src/matlab/featherread.m b/matlab/src/matlab/featherread.m index 5f4952873f9..736fe832888 100644 --- a/matlab/src/matlab/featherread.m +++ b/matlab/src/matlab/featherread.m @@ -23,64 +23,45 @@ % specific language governing permissions and limitations % under the License. -import arrow.util.*; + arguments + filename(1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} + end -% Validate input arguments. -narginchk(1, 1); -filename = convertStringsToChars(filename); -if ~ischar(filename) - error('MATLAB:arrow:InvalidFilenameDatatype', ... - 'Filename must be a character vector or string scalar.'); -end + typesToCast = [arrow.type.ID.UInt8, ... + arrow.type.ID.UInt16, ... + arrow.type.ID.UInt32, ... + arrow.type.ID.UInt64, ... + arrow.type.ID.Int8, ... + arrow.type.ID.Int16, ... + arrow.type.ID.Int32, ... + arrow.type.ID.Int64, ... + arrow.type.ID.Boolean]; -% FOPEN can be used to search for files without an extension on the MATLAB -% path. -fid = fopen(filename); -if fid ~= -1 - filename = fopen(fid); - fclose(fid); -else - error('MATLAB:arrow:UnableToOpenFile', ... - 'Unable to open file %s.', filename); -end + reader = arrow.internal.io.feather.Reader(filename); + recordBatch = reader.read(); -% Read table variables and metadata from the given Feather file using -% libarrow. -[variables, metadata] = arrow.cpp.call('featherread', filename); + % Convert RecordBatch to a MATLAB table. + t = table(recordBatch); -% Make valid MATLAB table variable names out of any of the -% Feather table column names that are not valid MATLAB table -% variable names. -[variableNames, variableDescriptions] = makeValidMATLABTableVariableNames({variables.Name}); - -% Iterate over each table variable, handling invalid (null) entries -% and invalid MATLAB table variable names appropriately. -% Note: All Arrow arrays can have an associated validity (null) bitmap. -% The Apache Arrow specification defines 0 (false) to represent an -% invalid (null) array entry and 1 (true) to represent a valid -% (non-null) array entry. -for ii = 1:length(variables) - if ~all(variables(ii).Valid) - switch variables(ii).Type - case {'uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64'} - % MATLAB does not support missing values for integer types, so - % cast to double and set missing values to NaN in this case. - variables(ii).Data = double(variables(ii).Data); + % Cast integer and boolean columns containing null values + % to double. Substitute null values with NaN. + for ii = 1:recordBatch.NumColumns + array = recordBatch.column(ii); + type = array.Type.ID; + if any(type == typesToCast) && any(~array.Valid) + % Cast to double. + t.(ii) = double(t.(ii)); + % Substitute null values with NaN. + t{~array.Valid, ii} = NaN; end - - % Set invalid (null) entries to the appropriate MATLAB missing value using - % logical indexing. - variables(ii).Data(~variables(ii).Valid) = missing; end -end - -% Construct a MATLAB table from the Feather file data. -t = table(variables.Data, 'VariableNames', cellstr(variableNames)); -% Store original Feather table column names in the table.Properties.VariableDescriptions -% property if they were modified to be valid MATLAB table variable names. -if ~isempty(variableDescriptions) - t.Properties.VariableDescriptions = cellstr(variableDescriptions); -end + % Store original Feather table column names in the table.Properties.VariableDescriptions + % property if they were modified to be valid MATLAB table variable names. + modifiedColumnNameIndices = t.Properties.VariableNames ~= recordBatch.ColumnNames; + if any(modifiedColumnNameIndices) + originalColumnNames = recordBatch.ColumnNames(modifiedColumnNameIndices); + t.Properties.VariableDescriptions(modifiedColumnNameIndices) = compose("Original variable name: '%s'", originalColumnNames); + end end diff --git a/matlab/test/tfeather.m b/matlab/test/tfeather.m index e4c988e1dda..8cdbef27fad 100755 --- a/matlab/test/tfeather.m +++ b/matlab/test/tfeather.m @@ -143,7 +143,7 @@ function zeroByNTable(testCase) function ErrorIfUnableToOpenFile(testCase) filename = fullfile(pwd, 'temp.feather'); - testCase.verifyError(@() featherread(filename), 'MATLAB:arrow:UnableToOpenFile'); + testCase.verifyError(@() featherread(filename), 'arrow:io:FailedToOpenFileForRead'); end function ErrorIfCorruptedFeatherFile(testCase) @@ -156,16 +156,13 @@ function ErrorIfCorruptedFeatherFile(testCase) fwrite(fileID, [1; 5]); fclose(fileID); - testCase.verifyError(@() featherread(filename), 'MATLAB:arrow:status:Invalid'); + testCase.verifyError(@() featherread(filename), 'arrow:io:feather:FailedToCreateReader'); end function ErrorIfInvalidFilenameDatatype(testCase) - filename = fullfile(pwd, 'temp.feather'); - t = createTable; testCase.verifyError(@() featherwrite({table}, t), 'MATLAB:validation:UnableToConvert'); - testCase.verifyError(@() featherread({filename}), 'MATLAB:arrow:InvalidFilenameDatatype'); end function ErrorIfTooManyInputs(testCase) @@ -179,7 +176,7 @@ function ErrorIfTooManyInputs(testCase) function ErrorIfTooFewInputs(testCase) testCase.verifyError(@() featherwrite(), 'MATLAB:minrhs'); - testCase.verifyError(@() featherread(), 'MATLAB:narginchk:notEnoughInputs'); + testCase.verifyError(@() featherread(), 'MATLAB:minrhs'); end function ErrorIfMultiColVarExist(testCase) @@ -218,5 +215,41 @@ function NumericComplexUnsupported(testCase) testCase.verifyError(@() featherwrite(filename, actualTable) ,'arrow:array:ComplexNumeric'); end + + function SupportedTypes(testCase) + filename = fullfile(pwd, 'temp.feather'); + + % Create a table with all supported MATLAB types. + expectedTable = table(int8 ([1, 2, 3]'), ... + int16 ([1, 2, 3]'), ... + int32 ([1, 2, 3]'), ... + int64 ([1, 2, 3]'), ... + uint8 ([1, 2, 3]'), ... + uint16 ([1, 2, 3]'), ... + uint32 ([1, 2, 3]'), ... + uint64 ([1, 2, 3]'), ... + logical([1, 0, 1]'), ... + single ([1, 2, 3]'), ... + double ([1, 2, 3]'), ... + string (["A", "B", "C"]'), ... + datetime(2023, 6, 28) + days(0:2)'); + + actualTable = featherRoundTrip(filename, expectedTable); + testCase.verifyEqual(actualTable, expectedTable); + end + + function UnicodeVariableNames(testCase) + filename = fullfile(pwd, 'temp.feather'); + + smiley = "😀"; + tree = "🌲"; + mango = "🥭"; + columnNames = [smiley, tree, mango]; + expectedTable = table(1, 2, 3, VariableNames=columnNames); + + actualTable = featherRoundTrip(filename, expectedTable); + testCase.verifyEqual(actualTable, expectedTable); + end + end end diff --git a/matlab/test/tfeathermex.m b/matlab/test/tfeathermex.m index c0620e9054b..8b6b87aab10 100644 --- a/matlab/test/tfeathermex.m +++ b/matlab/test/tfeathermex.m @@ -50,17 +50,17 @@ function InvalidMATLABTableVariableNames(testCase) filename = fullfile(pwd, 'temp.feather'); % Create a table with an invalid MATLAB table variable name. - invalidVariable = arrow.util.createVariableStruct('double', 1, true, '@'); + invalidVariable = arrow.util.createVariableStruct('double', 1, true, ':'); validVariable = arrow.util.createVariableStruct('double', 1, true, 'Valid'); variables = [invalidVariable, validVariable]; metadata = arrow.util.createMetadataStruct(1, 2); arrow.cpp.call('featherwrite', filename, variables, metadata); t = featherread(filename); - testCase.verifyEqual(t.Properties.VariableNames{1}, 'x_'); + testCase.verifyEqual(t.Properties.VariableNames{1}, ':_1'); testCase.verifyEqual(t.Properties.VariableNames{2}, 'Valid'); - testCase.verifyEqual(t.Properties.VariableDescriptions{1}, 'Original variable name: ''@'''); + testCase.verifyEqual(t.Properties.VariableDescriptions{1}, 'Original variable name: '':'''); testCase.verifyEqual(t.Properties.VariableDescriptions{2}, ''); end end