From d375d908c7858d04b410750b9d5165e163c3609e Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Tue, 19 May 2020 19:49:44 -0700 Subject: [PATCH 01/10] Added decimal marker option in TextLoader --- .../DataLoadSave/Text/TextLoader.cs | 21 ++- .../Text/TextLoaderSaverCatalog.cs | 10 +- .../Common/EntryPoints/core_manifest.json | 12 ++ test/data/iris_decimal_marker_as_comma.txt | 151 ++++++++++++++++++ 4 files changed, 190 insertions(+), 4 deletions(-) create mode 100644 test/data/iris_decimal_marker_as_comma.txt diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 7ea6ab17e9..0d51a16be2 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -474,6 +474,12 @@ public class Options [Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")] public char[] Separators = new[] { Defaults.Separator }; + /// + /// The character that should be used as the decimal marker. + /// + [Argument(ArgumentType.AtMostOnce, Name = "Decimal Marker", HelpText = "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", ShortName = "decimal")] + public char DecimalMarker = Defaults.DecimalMarker; + /// /// Specifies the input columns that should be mapped to columns. 
/// @@ -535,6 +541,7 @@ internal static class Defaults internal const bool AllowQuoting = false; internal const bool AllowSparse = false; internal const char Separator = '\t'; + internal const char DecimalMarker = '.'; internal const bool HasHeader = false; internal const bool TrimWhitespace = false; internal const bool ReadMultilines = false; @@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo() // verWrittenCur: 0x00010009, // Introduced _flags //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present - verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags + //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags + verWrittenCur: 0x0001000D, // Added decimal marker option to allow for ',' to be a decimal marker verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1094,6 +1102,7 @@ private enum OptionFlags : uint // Input size is zero for unknown - determined by the data (including sparse rows). 
private readonly int _inputSize; private readonly char[] _separators; + private readonly char _decimalMarker; private readonly Bindings _bindings; private readonly Parser _parser; @@ -1210,6 +1219,9 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo } } + if (options.DecimalMarker == ',' && _separators.Contains(',')) + throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker); + _decimalMarker = options.DecimalMarker; _bindings = new Bindings(this, cols, headerFile, dataSample); _parser = new Parser(this); } @@ -1373,6 +1385,7 @@ private TextLoader(IHost host, ModelLoadContext ctx) // int: inputSize: 0 for determined from data // int: number of separators // char[]: separators + // char: decimal marker // bindings int cbFloat = ctx.Reader.ReadInt32(); host.CheckDecode(cbFloat == sizeof(float)); @@ -1397,6 +1410,8 @@ private TextLoader(IHost host, ModelLoadContext ctx) if (_separators.Contains(':')) host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0); + _decimalMarker = ctx.Reader.ReadChar(); + host.CheckDecode(_decimalMarker == '.' 
|| _decimalMarker == ','); _bindings = new Bindings(ctx, this); _parser = new Parser(this); } @@ -1437,6 +1452,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) // int: inputSize: 0 for determined from data // int: number of separators // char[]: separators + // char: decimal marker // bindings ctx.Writer.Write(sizeof(float)); ctx.Writer.Write(_maxRows); @@ -1445,6 +1461,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) _host.Assert(0 <= _inputSize && _inputSize < SrcLim); ctx.Writer.Write(_inputSize); ctx.Writer.WriteCharArray(_separators); + ctx.Writer.Write(_decimalMarker); _bindings.Save(ctx); } @@ -1470,6 +1487,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) internal static TextLoader CreateTextLoader(IHostEnvironment host, bool hasHeader = Defaults.HasHeader, char separator = Defaults.Separator, + char decimalMarker = Defaults.DecimalMarker, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace, @@ -1479,6 +1497,7 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, { HasHeader = hasHeader, Separators = new[] { separator }, + DecimalMarker = decimalMarker, AllowQuoting = allowQuoting, AllowSparse = supportSparse, TrimWhitespace = trimWhitespace diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 0fcc23fcef..d230f360ba 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -93,6 +93,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// names and their data types in the schema of the loaded data. /// The catalog. /// Column separator character. Default is '\t' + /// Decimal separator character. Default is '.' /// Whether the file has a header with feature names. 
When a is provided, /// indicates that the first line in the will be used for feature names, and that when /// is called, the first line will be skipped. When there is no provided, just indicates that the loader should @@ -111,13 +112,14 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, + char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, bool allowSparse = TextLoader.Defaults.AllowSparse) - => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, - allowSparse, trimWhitespace, dataSample: dataSample); + => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, decimalChar, + allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample); /// /// Create a text loader by inferencing the dataset schema from a data model type. @@ -221,6 +223,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str /// The catalog. /// The path to the file. /// Column separator character. Default is '\t' + /// Decimal separator character. Default is '.' /// Whether the file has a header. When , the loader will skip the first line when /// is called. /// Whether the input may include double-quoted values. 
This parameter is used to distinguish separator characters @@ -240,6 +243,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, char separatorChar = TextLoader.Defaults.Separator, + char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, @@ -254,7 +258,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, - allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); + decimalChar, allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); } /// diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 67033afde3..4dc8b278c3 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -369,6 +369,18 @@ "\t" ] }, + { + "Name": "Decimal Marker", + "Type": "Char", + "Desc": "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", + "Aliases": [ + "decimal" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "." 
+ }, { "Name": "TrimWhitespace", "Type": "Bool", diff --git a/test/data/iris_decimal_marker_as_comma.txt b/test/data/iris_decimal_marker_as_comma.txt new file mode 100644 index 0000000000..d9f3b06b4a --- /dev/null +++ b/test/data/iris_decimal_marker_as_comma.txt @@ -0,0 +1,151 @@ +#Label Sepal length Sepal width Petal length Petal width +0 5,1 3,5 1,4 0,2 +0 4,9 3,0 1,4 0,2 +0 4,7 3,2 1,3 0,2 +0 4,6 3,1 1,5 0,2 +0 5,0 3,6 1,4 0,2 +0 5,4 3,9 1,7 0,4 +0 4,6 3,4 1,4 0,3 +0 5,0 3,4 1,5 0,2 +0 4,4 2,9 1,4 0,2 +0 4,9 3,1 1,5 0,1 +0 5,4 3,7 1,5 0,2 +0 4,8 3,4 1,6 0,2 +0 4,8 3,0 1,4 0,1 +0 4,3 3,0 1,1 0,1 +0 5,8 4,0 1,2 0,2 +0 5,7 4,4 1,5 0,4 +0 5,4 3,9 1,3 0,4 +0 5,1 3,5 1,4 0,3 +0 5,7 3,8 1,7 0,3 +0 5,1 3,8 1,5 0,3 +0 5,4 3,4 1,7 0,2 +0 5,1 3,7 1,5 0,4 +0 4,6 3,6 1,0 0,2 +0 5,1 3,3 1,7 0,5 +0 4,8 3,4 1,9 0,2 +0 5,0 3,0 1,6 0,2 +0 5,0 3,4 1,6 0,4 +0 5,2 3,5 1,5 0,2 +0 5,2 3,4 1,4 0,2 +0 4,7 3,2 1,6 0,2 +0 4,8 3,1 1,6 0,2 +0 5,4 3,4 1,5 0,4 +0 5,2 4,1 1,5 0,1 +0 5,5 4,2 1,4 0,2 +0 4,9 3,1 1,5 0,1 +0 5,0 3,2 1,2 0,2 +0 5,5 3,5 1,3 0,2 +0 4,9 3,1 1,5 0,1 +0 4,4 3,0 1,3 0,2 +0 5,1 3,4 1,5 0,2 +0 5,0 3,5 1,3 0,3 +0 4,5 2,3 1,3 0,3 +0 4,4 3,2 1,3 0,2 +0 5,0 3,5 1,6 0,6 +0 5,1 3,8 1,9 0,4 +0 4,8 3,0 1,4 0,3 +0 5,1 3,8 1,6 0,2 +0 4,6 3,2 1,4 0,2 +0 5,3 3,7 1,5 0,2 +0 5,0 3,3 1,4 0,2 +1 7,0 3,2 4,7 1,4 +1 6,4 3,2 4,5 1,5 +1 6,9 3,1 4,9 1,5 +1 5,5 2,3 4,0 1,3 +1 6,5 2,8 4,6 1,5 +1 5,7 2,8 4,5 1,3 +1 6,3 3,3 4,7 1,6 +1 4,9 2,4 3,3 1,0 +1 6,6 2,9 4,6 1,3 +1 5,2 2,7 3,9 1,4 +1 5,0 2,0 3,5 1,0 +1 5,9 3,0 4,2 1,5 +1 6,0 2,2 4,0 1,0 +1 6,1 2,9 4,7 1,4 +1 5,6 2,9 3,6 1,3 +1 6,7 3,1 4,4 1,4 +1 5,6 3,0 4,5 1,5 +1 5,8 2,7 4,1 1,0 +1 6,2 2,2 4,5 1,5 +1 5,6 2,5 3,9 1,1 +1 5,9 3,2 4,8 1,8 +1 6,1 2,8 4,0 1,3 +1 6,3 2,5 4,9 1,5 +1 6,1 2,8 4,7 1,2 +1 6,4 2,9 4,3 1,3 +1 6,6 3,0 4,4 1,4 +1 6,8 2,8 4,8 1,4 +1 6,7 3,0 5,0 1,7 +1 6,0 2,9 4,5 1,5 +1 5,7 2,6 3,5 1,0 +1 5,5 2,4 3,8 1,1 +1 5,5 2,4 3,7 1,0 +1 5,8 2,7 3,9 1,2 +1 6,0 2,7 5,1 1,6 +1 5,4 3,0 4,5 1,5 +1 6,0 3,4 4,5 1,6 +1 6,7 3,1 4,7 1,5 +1 6,3 
2,3 4,4 1,3 +1 5,6 3,0 4,1 1,3 +1 5,5 2,5 4,0 1,3 +1 5,5 2,6 4,4 1,2 +1 6,1 3,0 4,6 1,4 +1 5,8 2,6 4,0 1,2 +1 5,0 2,3 3,3 1,0 +1 5,6 2,7 4,2 1,3 +1 5,7 3,0 4,2 1,2 +1 5,7 2,9 4,2 1,3 +1 6,2 2,9 4,3 1,3 +1 5,1 2,5 3,0 1,1 +1 5,7 2,8 4,1 1,3 +2 6,3 3,3 6,0 2,5 +2 5,8 2,7 5,1 1,9 +2 7,1 3,0 5,9 2,1 +2 6,3 2,9 5,6 1,8 +2 6,5 3,0 5,8 2,2 +2 7,6 3,0 6,6 2,1 +2 4,9 2,5 4,5 1,7 +2 7,3 2,9 6,3 1,8 +2 6,7 2,5 5,8 1,8 +2 7,2 3,6 6,1 2,5 +2 6,5 3,2 5,1 2,0 +2 6,4 2,7 5,3 1,9 +2 6,8 3,0 5,5 2,1 +2 5,7 2,5 5,0 2,0 +2 5,8 2,8 5,1 2,4 +2 6,4 3,2 5,3 2,3 +2 6,5 3,0 5,5 1,8 +2 7,7 3,8 6,7 2,2 +2 7,7 2,6 6,9 2,3 +2 6,0 2,2 5,0 1,5 +2 6,9 3,2 5,7 2,3 +2 5,6 2,8 4,9 2,0 +2 7,7 2,8 6,7 2,0 +2 6,3 2,7 4,9 1,8 +2 6,7 3,3 5,7 2,1 +2 7,2 3,2 6,0 1,8 +2 6,2 2,8 4,8 1,8 +2 6,1 3,0 4,9 1,8 +2 6,4 2,8 5,6 2,1 +2 7,2 3,0 5,8 1,6 +2 7,4 2,8 6,1 1,9 +2 7,9 3,8 6,4 2,0 +2 6,4 2,8 5,6 2,2 +2 6,3 2,8 5,1 1,5 +2 6,1 2,6 5,6 1,4 +2 7,7 3,0 6,1 2,3 +2 6,3 3,4 5,6 2,4 +2 6,4 3,1 5,5 1,8 +2 6,0 3,0 4,8 1,8 +2 6,9 3,1 5,4 2,1 +2 6,7 3,1 5,6 2,4 +2 6,9 3,1 5,1 2,3 +2 5,8 2,7 5,1 1,9 +2 6,8 3,2 5,9 2,3 +2 6,7 3,3 5,7 2,5 +2 6,7 3,0 5,2 2,3 +2 6,3 2,5 5,0 1,9 +2 6,5 3,0 5,2 2,0 +2 6,2 3,4 5,4 2,3 +2 5,9 3,0 5,1 1,8 From 544dab6122d84b763854231764636131ded75fbe Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Tue, 19 May 2020 21:11:18 -0700 Subject: [PATCH 02/10] Added decimalChar to more TextLoader constructors --- .../DataLoadSave/Text/TextLoaderSaverCatalog.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index d230f360ba..5471a2f6c9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -21,6 +21,7 @@ public static class TextLoaderSaverCatalog /// The catalog. /// Array of columns defining the schema. 
/// The character used as separator between data points in a row. By default the tab character is used as separator. + /// Decimal separator character. Default is '.' /// Whether the file has a header with feature names. When a is provided, /// indicates that the first line in the will be used for feature names, and that when /// is called, the first line will be skipped. When there is no provided, just indicates that the loader should @@ -51,6 +52,7 @@ public static class TextLoaderSaverCatalog public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, + char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, @@ -61,6 +63,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, { Columns = columns, Separators = new[] { separatorChar }, + DecimalMarker = decimalChar, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, @@ -142,6 +145,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// The path to the file. /// The columns of the schema. /// The character used as separator between data points in a row. By default the tab character is used as separator. + /// Decimal separator character. Default is '.' /// Whether the file has a header. When , the loader will skip the first line when /// is called. /// Whether the input may include double-quoted values. 
This parameter is used to distinguish separator characters @@ -162,6 +166,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, + char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, @@ -177,6 +182,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, { Columns = columns, Separators = new[] { separatorChar }, + DecimalMarker = decimalChar, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, From a8a9b54b0a05708f988db5366a20c11114fe410e Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Tue, 19 May 2020 22:31:35 -0700 Subject: [PATCH 03/10] Removed decimalMarker from TextLoader constructors due to API breaking --- .../DataLoadSave/Text/TextLoader.cs | 4 ++-- .../DataLoadSave/Text/TextLoaderSaverCatalog.cs | 16 +++------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 0d51a16be2..99cb5db826 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1487,11 +1487,11 @@ void ICanSaveModel.Save(ModelSaveContext ctx) internal static TextLoader CreateTextLoader(IHostEnvironment host, bool hasHeader = Defaults.HasHeader, char separator = Defaults.Separator, - char decimalMarker = Defaults.DecimalMarker, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace, - IMultiStreamSource dataSample = null) + IMultiStreamSource dataSample = null, + char decimalMarker = Defaults.DecimalMarker) { Options options = new Options 
{ diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 5471a2f6c9..0fcc23fcef 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -21,7 +21,6 @@ public static class TextLoaderSaverCatalog /// The catalog. /// Array of columns defining the schema. /// The character used as separator between data points in a row. By default the tab character is used as separator. - /// Decimal separator character. Default is '.' /// Whether the file has a header with feature names. When a is provided, /// indicates that the first line in the will be used for feature names, and that when /// is called, the first line will be skipped. When there is no provided, just indicates that the loader should @@ -52,7 +51,6 @@ public static class TextLoaderSaverCatalog public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, - char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, @@ -63,7 +61,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, { Columns = columns, Separators = new[] { separatorChar }, - DecimalMarker = decimalChar, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, @@ -96,7 +93,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// names and their data types in the schema of the loaded data. /// The catalog. /// Column separator character. Default is '\t' - /// Decimal separator character. Default is '.' /// Whether the file has a header with feature names. 
When a is provided, /// indicates that the first line in the will be used for feature names, and that when /// is called, the first line will be skipped. When there is no provided, just indicates that the loader should @@ -115,14 +111,13 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, - char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, bool allowSparse = TextLoader.Defaults.AllowSparse) - => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, decimalChar, - allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample); + => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, + allowSparse, trimWhitespace, dataSample: dataSample); /// /// Create a text loader by inferencing the dataset schema from a data model type. @@ -145,7 +140,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// The path to the file. /// The columns of the schema. /// The character used as separator between data points in a row. By default the tab character is used as separator. - /// Decimal separator character. Default is '.' /// Whether the file has a header. When , the loader will skip the first line when /// is called. /// Whether the input may include double-quoted values. 
This parameter is used to distinguish separator characters @@ -166,7 +160,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, - char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, @@ -182,7 +175,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, { Columns = columns, Separators = new[] { separatorChar }, - DecimalMarker = decimalChar, HasHeader = hasHeader, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace, @@ -229,7 +221,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str /// The catalog. /// The path to the file. /// Column separator character. Default is '\t' - /// Decimal separator character. Default is '.' /// Whether the file has a header. When , the loader will skip the first line when /// is called. /// Whether the input may include double-quoted values. This parameter is used to distinguish separator characters @@ -249,7 +240,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, char separatorChar = TextLoader.Defaults.Separator, - char decimalChar = TextLoader.Defaults.DecimalMarker, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, @@ -264,7 +254,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. 
return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, - decimalChar, allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); + allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path)); } /// From 7658a70887bdda3c59bef9f5ae30b27175c00b9b Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Wed, 20 May 2020 14:08:28 -0700 Subject: [PATCH 04/10] Added unit test for ',' as a decimal marker, and added decimalMarker to TextLoaderCursor and TextLoaderParser --- .../Utilities/DoubleParser.cs | 12 ++++--- .../DataLoadSave/Text/TextLoader.cs | 13 ++++--- .../DataLoadSave/Text/TextLoaderCursor.cs | 12 ++++--- .../DataLoadSave/Text/TextLoaderParser.cs | 2 ++ test/Microsoft.ML.Tests/TextLoaderTests.cs | 35 +++++++++++++++++++ 5 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index bad57cdd3c..18e2c0fd46 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -527,6 +527,8 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool Contracts.Assert(num == 0); Contracts.Assert(exp == 0); + const char decimalMarker = '.'; + if (ich >= span.Length) return false; @@ -554,7 +556,7 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool return false; break; - case '.': + case decimalMarker: goto LPoint; // The common cases. @@ -571,7 +573,7 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool break; } - // Get digits before '.' + // Get digits before the decimal marker, which may be '.' 
or ',' uint d; for (; ; ) { @@ -593,14 +595,14 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool } Contracts.Assert(i < span.Length); - if (span[i] != '.') + if (span[i] != decimalMarker) goto LAfterDigits; LPoint: Contracts.Assert(i < span.Length); - Contracts.Assert(span[i] == '.'); + Contracts.Assert(span[i] == decimalMarker); - // Get the digits after '.' + // Get the digits after the decimal marker, which may be '.' or ',' for (; ; ) { if (++i >= span.Length) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 99cb5db826..0d0b2c179c 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -709,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); if (needInputSize && inputSize == 0) { @@ -1410,8 +1410,11 @@ private TextLoader(IHost host, ModelLoadContext ctx) if (_separators.Contains(':')) host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0); - _decimalMarker = ctx.Reader.ReadChar(); - host.CheckDecode(_decimalMarker == '.' 
|| _decimalMarker == ','); + if (ctx.Header.ModelVerWritten >= 0x0001000D) + { + _decimalMarker = ctx.Reader.ReadChar(); + host.CheckDecode(_decimalMarker == '.' || _decimalMarker == ','); + } _bindings = new Bindings(ctx, this); _parser = new Parser(this); } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 62f5709169..f2ff6ec4a1 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1, parent._decimalMarker); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd, parent._decimalMarker); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines) + 
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines, char decimalMarker) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM count = 2; LineBatch batch; - var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1); + var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1, decimalMarker); try { batch = reader.GetBatch(); @@ -404,6 +404,7 @@ private sealed class LineReader private readonly bool _hasHeader; private readonly bool _readMultilines; private readonly char[] _separators; + private readonly char _decimalMarker; private readonly int _batchSize; private readonly IMultiStreamSource _files; @@ -413,7 +414,7 @@ private sealed class LineReader private Task _thdRead; private volatile bool _abort; - public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref) + public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref, char decimalMarker) { // Note that files is allowed to be empty. 
Contracts.AssertValue(files); @@ -430,6 +431,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has _separators = separators; _files = files; _cref = cref; + _decimalMarker = decimalMarker; _queue = new BlockingQueue(bufSize); _thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 13019c4bf2..c84cb1927c 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -633,6 +633,7 @@ public void Clear() } private readonly char[] _separators; + private readonly char _decimalMarker; private readonly OptionFlags _flags; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -683,6 +684,7 @@ public Parser(TextLoader parent) } _separators = parent._separators; + _decimalMarker = parent._decimalMarker; _flags = parent._flags; _inputSize = parent._inputSize; Contracts.Assert(_inputSize >= 0); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index b2421bacce..e249808fda 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -803,6 +803,41 @@ public void TestTextLoaderKeyTypeBackCompat() } } + [Fact] + public void TestCommaAsDecimalMarker() + { + string dataPath = GetDataPath("iris_decimal_marker_as_comma.txt"); + + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. 
+ var mlContext = new MLContext(seed: 1); + var reader = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = ',' + }); + // Data + var textData = reader.Load(GetDataPath(dataPath)); + var data = mlContext.Data.Cache(mlContext.Transforms.Conversion.MapValueToKey("Label") + .Fit(textData).Transform(textData)); + + // Pipeline + var pipeline = mlContext.MulticlassClassification.Trainers.OneVersusAll( + mlContext.BinaryClassification.Trainers.LinearSvm(new Trainers.LinearSvmTrainer.Options { NumberOfIterations = 100 }), + useProbabilities: false); + + var model = pipeline.Fit(data); + var predictions = model.Transform(data); + + // Metrics + var metrics = mlContext.MulticlassClassification.Evaluate(predictions); + Assert.True(metrics.MicroAccuracy > 0.83); + } + private class IrisNoFields { } From ece551852a2804a9a70e5ddd1a2bc4b6b6139577 Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Wed, 20 May 2020 17:33:15 -0700 Subject: [PATCH 05/10] Added DecimalMarker in DoubleParser --- src/Microsoft.ML.Core/Utilities/DoubleParser.cs | 16 +++++++++++----- .../DataLoadSave/Text/TextLoader.cs | 10 ++++++---- .../DataLoadSave/Text/TextLoaderCursor.cs | 12 +++++------- .../DataLoadSave/Text/TextLoaderParser.cs | 2 -- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index 18e2c0fd46..1740bd3daa 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -17,6 +17,13 @@ internal static class DoubleParser private const ulong TopThreeBits = 0xE000000000000000UL; private const char InfinitySymbol = '\u221E'; + // The decimal marker that separates the integer part from the fractional 
part of a number + // written in decimal from can vary across different cultures as either '.' or ','. The + // default decimal marker in ML .NET is '.', however through this static char variable, + // we allow users to specify the decimal marker used in their datasets as ',' as well. + [BestFriend] + internal static char DecimalMarker = '.'; + // REVIEW: casting ulong to Double doesn't always do the right thing, for example // with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that // long to Double does work. The work around is: @@ -527,8 +534,6 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool Contracts.Assert(num == 0); Contracts.Assert(exp == 0); - const char decimalMarker = '.'; - if (ich >= span.Length) return false; @@ -556,7 +561,8 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool return false; break; - case decimalMarker: + case '.': + case ',': goto LPoint; // The common cases. @@ -595,12 +601,12 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool } Contracts.Assert(i < span.Length); - if (span[i] != decimalMarker) + if (span[i] != DecimalMarker) goto LAfterDigits; LPoint: Contracts.Assert(i < span.Length); - Contracts.Assert(span[i] == decimalMarker); + Contracts.Assert(span[i] == DecimalMarker); // Get the digits after the decimal marker, which may be '.' 
or ',' for (; ; ) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 0d0b2c179c..b69038a4d7 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -709,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines); if (needInputSize && inputSize == 0) { @@ -1219,7 +1219,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo } } - if (options.DecimalMarker == ',' && _separators.Contains(',')) + if (_separators.Contains(options.DecimalMarker)) throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker); _decimalMarker = options.DecimalMarker; _bindings = new Bindings(this, cols, headerFile, dataSample); @@ -1607,6 +1607,7 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files) public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null) { _host.CheckValueOrNull(rand); + DoubleParser.DecimalMarker = _loader._decimalMarker; var active = 
Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded); return Cursor.Create(_loader, _files, active); } @@ -1614,6 +1615,7 @@ public DataViewRowCursor GetRowCursor(IEnumerable columns public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) { _host.CheckValueOrNull(rand); + DoubleParser.DecimalMarker = _loader._decimalMarker; var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded); return Cursor.CreateSet(_loader, _files, active, n); } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index f2ff6ec4a1..62f5709169 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1, parent._decimalMarker); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd, parent._decimalMarker); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] 
{ new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines, char decimalMarker) + public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM count = 2; LineBatch batch; - var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1, decimalMarker); + var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1); try { batch = reader.GetBatch(); @@ -404,7 +404,6 @@ private sealed class LineReader private readonly bool _hasHeader; private readonly bool _readMultilines; private readonly char[] _separators; - private readonly char _decimalMarker; private readonly int _batchSize; private readonly IMultiStreamSource _files; @@ -414,7 +413,7 @@ private sealed class LineReader private Task _thdRead; private volatile bool _abort; - public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref, char decimalMarker) + public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref) { // Note that files is allowed to be empty. 
Contracts.AssertValue(files); @@ -431,7 +430,6 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has _separators = separators; _files = files; _cref = cref; - _decimalMarker = decimalMarker; _queue = new BlockingQueue(bufSize); _thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index c84cb1927c..13019c4bf2 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -633,7 +633,6 @@ public void Clear() } private readonly char[] _separators; - private readonly char _decimalMarker; private readonly OptionFlags _flags; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -684,7 +683,6 @@ public Parser(TextLoader parent) } _separators = parent._separators; - _decimalMarker = parent._decimalMarker; _flags = parent._flags; _inputSize = parent._inputSize; Contracts.Assert(_inputSize >= 0); From a663f210db833e1d48654c86ce0ff753230d9447 Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Wed, 20 May 2020 17:48:33 -0700 Subject: [PATCH 06/10] Added decimal marker check and removed decimalMarker from CreateTextLoader's constructor --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index b69038a4d7..9b5c50184e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1219,6 +1219,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo } } + if (options.DecimalMarker != '.' 
&& options.DecimalMarker != ',') + throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker); if (_separators.Contains(options.DecimalMarker)) throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker); _decimalMarker = options.DecimalMarker; @@ -1493,14 +1495,12 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace, - IMultiStreamSource dataSample = null, - char decimalMarker = Defaults.DecimalMarker) + IMultiStreamSource dataSample = null) { Options options = new Options { HasHeader = hasHeader, Separators = new[] { separator }, - DecimalMarker = decimalMarker, AllowQuoting = allowQuoting, AllowSparse = supportSparse, TrimWhitespace = trimWhitespace From 141fa7be65febd22e87b97723c2ba9e447972648 Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Thu, 21 May 2020 18:30:13 -0700 Subject: [PATCH 07/10] Added TextLoader decimalMarker unit tests, and refined logic in DoubleParser --- .../Utilities/DoubleParser.cs | 13 + .../DataLoadSave/Text/TextLoader.cs | 4 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 340 ++++++++++++++++-- test/data/iris-decimal-marker-as-comma.csv | 151 ++++++++ ...a.txt => iris-decimal-marker-as-comma.txt} | 0 5 files changed, 484 insertions(+), 24 deletions(-) create mode 100644 test/data/iris-decimal-marker-as-comma.csv rename test/data/{iris_decimal_marker_as_comma.txt => iris-decimal-marker-as-comma.txt} (100%) diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs index 1740bd3daa..22bd8ea82e 100644 --- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs +++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs @@ -17,6 +17,14 
@@ internal static class DoubleParser private const ulong TopThreeBits = 0xE000000000000000UL; private const char InfinitySymbol = '\u221E'; + // Note for future development: DoubleParser is a static class and DecimalMarker is a + // static variable, which means only one instance of these can exist at once. As such, + // the value of DecimalMarker cannot vary when datasets with differing decimal markers + // are loaded together at once, which would result in not being able to accurately read + // the dataset with the differing decimal marker. Although this edge case where we attempt + // to load in datasets with different decimal markers at once is unlikely to occur, we + // should still be aware of this and plan to fix it in the future. + // The decimal marker that separates the integer part from the fractional part of a number // written in decimal from can vary across different cultures as either '.' or ','. The // default decimal marker in ML .NET is '.', however through this static char variable, @@ -562,7 +570,12 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool break; case '.': + if (DecimalMarker != '.') // Decimal marker was not '.', but we encountered a '.', which must be an error. + return false; // Since this was an error, return false, which later causes the caller to set the out value to NaN. + goto LPoint; case ',': + if (DecimalMarker != ',') // Same logic as above. + return false; goto LPoint; // The common cases. 
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 9b5c50184e..83438290c9 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1071,7 +1071,7 @@ private static VersionInfo GetVersionInfo() //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags - verWrittenCur: 0x0001000D, // Added decimal marker option to allow for ',' to be a decimal marker + verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1221,8 +1221,6 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo if (options.DecimalMarker != '.' && options.DecimalMarker != ',') throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' 
or ','.", options.DecimalMarker); - if (_separators.Contains(options.DecimalMarker)) - throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker); _decimalMarker = options.DecimalMarker; _bindings = new Bindings(this, cols, headerFile, dataSample); _parser = new Parser(this); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index e249808fda..201d8ae678 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -804,38 +804,336 @@ public void TestTextLoaderKeyTypeBackCompat() } [Fact] - public void TestCommaAsDecimalMarker() + public void TestTextLoaderBackCompat_VerWritt_0x0001000C() { - string dataPath = GetDataPath("iris_decimal_marker_as_comma.txt"); + // Checks backward compatibility with a text loader created with "verWrittenCur: 0x0001000C" + // Model generated with: + // loader=text{header+ col=SepalLength:Num:0 col=SepalWidth:Num:1 col=PetalLength:Num:2 col=PetalWidth:Num:2 col=Cat:TX:1-8 col=Num:9-14 col=Type:TX:4} + var mlContext = new MLContext(1); + string textLoaderModelPath = GetDataPath("backcompat/textloader_VerWritt_0x0001000C.zip"); + string irisPath = GetDataPath(TestDatasets.irisData.trainFilename); + + IDataView iris; + using (FileStream modelfs = File.OpenRead(textLoaderModelPath)) + using (var rep = RepositoryReader.Open(modelfs, mlContext)) + { + iris = ModelFileUtils.LoadLoader(mlContext, rep, new MultiFileSource(irisPath), false); + } + + var previewIris = iris.Preview(1); + var irisFirstRow = new Dictionary(); + irisFirstRow["SepalLength"] = 5.1f; + irisFirstRow["SepalWidth"] = 3.5f; + irisFirstRow["PetalLength"] = 1.4f; + irisFirstRow["PetalWidth"] = 0.2f; + + Assert.Equal(5, previewIris.ColumnView.Length); + Assert.Equal("SepalLength", previewIris.Schema[0].Name); + Assert.Equal(NumberDataViewType.Single, previewIris.Schema[0].Type); + int index = 0; 
+ foreach (var entry in irisFirstRow) + { + Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key); + Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value); + } + Assert.Equal("Type", previewIris.RowView[0].Values[index].Key); + Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); + } + + [Fact] + public void TestCommaAsDecimalMarkerFloat() + { + // Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their + // decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt. + // Here, the features are of type float (Single), and the test checks for decimal markers with floats. + var mlContext = new MLContext(seed: 1); + + UInt32[] labels = new uint[150]; + float[][] features = new float[150][]; + + // Read dataset with period as decimal marker. + string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); + var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = '.' 
+ }); + var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod)); + + // Load values from iris.txt + DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema; + using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); + UInt32 labelPeriod = default; + ValueGetter labelDelegatePeriod = cursorPeriod.GetGetter(columnsPeriod[0]); + VBuffer featuresPeriod = default; + ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); + + // Iterate over each row and save labels and features to array for future comparison + int count = 0; + while (cursorPeriod.MoveNext()) + { + //Get values from respective columns + labelDelegatePeriod(ref labelPeriod); + featuresDelegatePeriod(ref featuresPeriod); + labels[count] = labelPeriod; + features[count] = featuresPeriod.GetValues().ToArray(); + count++; + } + + // Read dataset with comma as decimal marker. + string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = ',' + }); + var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); + + // Load values from iris-decimal-marker-as-comma.txt + DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; + using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); + UInt32 labelComma = default; + ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); + VBuffer featuresComma = default; + ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); + + // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt + count = 0; + while 
(cursorComma.MoveNext()) + { + //Get values from respective columns + labelDelegateComma(ref labelComma); + featuresDelegateComma(ref featuresComma); + Assert.Equal(labels[count], labelComma); + Assert.Equal(features[count], featuresComma.GetValues().ToArray()); + count++; + } + } + + [Fact] + public void TestCommaAsDecimalMarkerDouble() + { + // Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their + // decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt. + // Here, the features are of type double, and the test checks for decimal markers with double. + var mlContext = new MLContext(seed: 1); + + // Read dataset with period as decimal marker. + string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); + var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = '.' 
+ }); + var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod)); + + // Load values from iris.txt + DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema; + using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); + UInt32 labelPeriod = default; + ValueGetter labelDelegatePeriod = cursorPeriod.GetGetter(columnsPeriod[0]); + VBuffer featuresPeriod = default; + ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); + + UInt32[] labels = new uint[150]; + double[][] features = new double[150][]; + + // Iterate over each row and save labels and features to array for future comparison + int count = 0; + while (cursorPeriod.MoveNext()) + { + //Get values from respective columns + labelDelegatePeriod(ref labelPeriod); + featuresDelegatePeriod(ref featuresPeriod); + labels[count] = labelPeriod; + features[count] = featuresPeriod.GetValues().ToArray(); + count++; + } + + // Read dataset with comma as decimal marker. 
+ string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = ',' + }); + var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); + + // Load values from iris-decimal-marker-as-comma.txt + DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; + using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); + UInt32 labelComma = default; + ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); + VBuffer featuresComma = default; + ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); + + // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt + count = 0; + while (cursorComma.MoveNext()) + { + //Get values from respective columns + labelDelegateComma(ref labelComma); + featuresDelegateComma(ref featuresComma); + Assert.Equal(labels[count], labelComma); + Assert.Equal(features[count], featuresComma.GetValues().ToArray()); + count++; + } + } + + [Fact] + public void TestWrongDecimalMarkerInputs() + { + // When DecimalMarker does not match the actual decimal marker used in the dataset, + // we obtain values of NaN. Check that the values are indeed NaN in this case. + var mlContext = new MLContext(seed: 1); + + // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ','. 
+ string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); + var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = ',' + }); + var textDataMismatched1 = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerPeriod)); + + // Check that the features being loaded are NaN. + DataViewSchema columnsPeriod = textDataMismatched1.Schema; + using DataViewRowCursor cursorPeriod = textDataMismatched1.GetRowCursor(columnsPeriod); + VBuffer featuresPeriod = default; + ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); + + // Iterate over each row + while (cursorPeriod.MoveNext()) + { + featuresDelegatePeriod.Invoke(ref featuresPeriod); + foreach(float feature in featuresPeriod.GetValues().ToArray()) + Assert.Equal(feature, Single.NaN); + } + + // Try reading a dataset where ',' is the actual decimal marker, but DecimalMarker = '.'. + string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, + DecimalMarker = '.' 
+ }); + var textDataMismatched2 = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerComma)); + + DataViewSchema columnsComma = textDataMismatched2.Schema; + using DataViewRowCursor cursorComma = textDataMismatched2.GetRowCursor(columnsComma); + VBuffer featuresComma = default; + ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); + + // Iterate over each row + while (cursorComma.MoveNext()) + { + featuresDelegateComma.Invoke(ref featuresComma); + foreach (float feature in featuresComma.GetValues().ToArray()) + Assert.Equal(feature, Single.NaN); + } + } + + [Fact] + public void TestCommaAsDecimalMarkerWithSeperatorAsCommaInCSV() + { + // Confirm that TextLoader can read a CSV file whose separator is ',' when the decimal values, which also + // use ',' as their decimal marker, are enclosed in quotes. + string dataPathCsv = GetDataPath("iris-decimal-marker-as-comma.csv"); - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. 
var mlContext = new MLContext(seed: 1); - var reader = new TextLoader(mlContext, new TextLoader.Options() + var readerCsv = new TextLoader(mlContext, new TextLoader.Options() { Columns = new[] { new TextLoader.Column("Label", DataKind.Single, 0), new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), }, + DecimalMarker = ',', + Separator = ",", + AllowQuoting = true, + HasHeader = true + }); + var textDataCsv = readerCsv.Load(GetDataPath(dataPathCsv)); + + // Load values from iris-decimal-marker-as-comma.csv + DataViewSchema columnsCsv = textDataCsv.Schema; + using DataViewRowCursor cursorCsv = textDataCsv.GetRowCursor(columnsCsv); + UInt32 labelCsv = default; + ValueGetter labelDelegatePeriod = cursorCsv.GetGetter(columnsCsv[0]); + VBuffer featuresCsv = default; + ValueGetter> featuresDelegatePeriod = cursorCsv.GetGetter>(columnsCsv[1]); + + UInt32[] labels = new uint[150]; + double[][] features = new double[150][]; + + // Iterate over each row and save labels and features to array for future comparison + int count = 0; + while (cursorCsv.MoveNext()) + { + //Get values from respective columns + labelDelegatePeriod(ref labelCsv); + featuresDelegatePeriod(ref featuresCsv); + labels[count] = labelCsv; + features[count] = featuresCsv.GetValues().ToArray(); + count++; + } + + // Read dataset with comma as decimal marker. 
+ string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.UInt32, 0), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + }, DecimalMarker = ',' }); - // Data - var textData = reader.Load(GetDataPath(dataPath)); - var data = mlContext.Data.Cache(mlContext.Transforms.Conversion.MapValueToKey("Label") - .Fit(textData).Transform(textData)); - - // Pipeline - var pipeline = mlContext.MulticlassClassification.Trainers.OneVersusAll( - mlContext.BinaryClassification.Trainers.LinearSvm(new Trainers.LinearSvmTrainer.Options { NumberOfIterations = 100 }), - useProbabilities: false); - - var model = pipeline.Fit(data); - var predictions = model.Transform(data); - - // Metrics - var metrics = mlContext.MulticlassClassification.Evaluate(predictions); - Assert.True(metrics.MicroAccuracy > 0.83); + var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); + + // Load values from iris-decimal-marker-as-comma.txt + DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; + using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); + UInt32 labelComma = default; + ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); + VBuffer featuresComma = default; + ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); + + // Check values from iris-decimal-marker-as-comma.txt match those in iris-decimal-marker-as-comma.csv + count = 0; + while (cursorComma.MoveNext()) + { + //Get values from respective columns + labelDelegateComma(ref labelComma); + featuresDelegateComma(ref featuresComma); + Assert.Equal(labels[count], labelComma); + Assert.Equal(features[count], featuresComma.GetValues().ToArray()); + count++; + } } private class IrisNoFields 
diff --git a/test/data/iris-decimal-marker-as-comma.csv b/test/data/iris-decimal-marker-as-comma.csv new file mode 100644 index 0000000000..0612dd281e --- /dev/null +++ b/test/data/iris-decimal-marker-as-comma.csv @@ -0,0 +1,151 @@ +Label,Sepal,length,Sepal,width,Petal length,Petal width +0,"5,1","3,5","1,4","0.2" +0,"4,9","3,0","1,4","0.2" +0,"4,7","3,2","1,3","0.2" +0,"4,6","3,1","1,5","0.2" +0,"5,0","3,6","1,4","0.2" +0,"5,4","3,9","1,7","0.4" +0,"4,6","3,4","1,4","0.3" +0,"5,0","3,4","1,5","0.2" +0,"4,4","2,9","1,4","0.2" +0,"4,9","3,1","1,5","0.1" +0,"5,4","3,7","1,5","0.2" +0,"4,8","3,4","1,6","0.2" +0,"4,8","3,0","1,4","0.1" +0,"4,3","3,0","1,1","0.1" +0,"5,8","4,0","1,2","0.2" +0,"5,7","4,4","1,5","0.4" +0,"5,4","3,9","1,3","0.4" +0,"5,1","3,5","1,4","0.3" +0,"5,7","3,8","1,7","0.3" +0,"5,1","3,8","1,5","0.3" +0,"5,4","3,4","1,7","0.2" +0,"5,1","3,7","1,5","0.4" +0,"4,6","3,6","1,0","0.2" +0,"5,1","3,3","1,7","0,5" +0,"4,8","3,4","1,9","0.2" +0,"5,0","3,0","1,6","0.2" +0,"5,0","3,4","1,6","0.4" +0,"5,2","3,5","1,5","0.2" +0,"5,2","3,4","1,4","0.2" +0,"4,7","3,2","1,6","0.2" +0,"4,8","3,1","1,6","0.2" +0,"5,4","3,4","1,5","0.4" +0,"5,2","4,1","1,5","0.1" +0,"5,5","4,2","1,4","0.2" +0,"4,9","3,1","1,5","0.1" +0,"5,0","3,2","1,2","0.2" +0,"5,5","3,5","1,3","0.2" +0,"4,9","3,1","1,5","0.1" +0,"4,4","3,0","1,3","0.2" +0,"5,1","3,4","1,5","0.2" +0,"5,0","3,5","1,3","0.3" +0,"4,5","2,3","1,3","0.3" +0,"4,4","3,2","1,3","0.2" +0,"5,0","3,5","1,6","0.6" +0,"5,1","3,8","1,9","0.4" +0,"4,8","3,0","1,4","0.3" +0,"5,1","3,8","1,6","0.2" +0,"4,6","3,2","1,4","0.2" +0,"5,3","3,7","1,5","0.2" +0,"5,0","3,3","1,4","0.2" +0,"7,0","3,2","4,7","1,4 +0,"6,4","3,2","4,5","1,5 +0,"6,9","3,1","4,9","1,5 +0,"5,5","2,3","4,0","1,3 +0,"6,5","2,8","4,6","1,5 +0,"5,7","2,8","4,5","1,3 +0,"6,3","3,3","4,7","1,6 +0,"4,9","2,4","3,3","1,0 +0,"6,6","2,9","4,6","1,3 +0,"5,2","2,7","3,9","1,4 +0,"5,0","2,0","3,5","1,0 +0,"5,9","3,0","4,2","1,5 +0,"6,0","2,2","4,0","1,0 
+0,"6,1","2,9","4,7","1,4 +0,"5,6","2,9","3,6","1,3 +0,"6,7","3,1","4,4","1,4 +0,"5,6","3,0","4,5","1,5 +0,"5,8","2,7","4,1","1,0 +0,"6,2","2,2","4,5","1,5 +0,"5,6","2,5","3,9","1,1 +0,"5,9","3,2","4,8","1,8 +0,"6,1","2,8","4,0","1,3 +0,"6,3","2,5","4,9","1,5 +0,"6,1","2,8","4,7","1,2 +0,"6,4","2,9","4,3","1,3 +0,"6,6","3,0","4,4","1,4 +0,"6,8","2,8","4,8","1,4 +0,"6,7","3,0","5,0","1,7 +0,"6,0","2,9","4,5","1,5 +0,"5,7","2,6","3,5","1,0 +0,"5,5","2,4","3,8","1,1 +0,"5,5","2,4","3,7","1,0 +0,"5,8","2,7","3,9","1,2 +0,"6,0","2,7","5,1","1,6 +0,"5,4","3,0","4,5","1,5 +0,"6,0","3,4","4,5","1,6 +0,"6,7","3,1","4,7","1,5 +0,"6,3","2,3","4,4","1,3 +0,"5,6","3,0","4,1","1,3 +0,"5,5","2,5","4,0","1,3 +0,"5,5","2,6","4,4","1,2 +0,"6,1","3,0","4,6","1,4 +0,"5,8","2,6","4,0","1,2 +0,"5,0","2,3","3,3","1,0 +0,"5,6","2,7","4,2","1,3 +0,"5,7","3,0","4,2","1,2 +0,"5,7","2,9","4,2","1,3 +0,"6,2","2,9","4,3","1,3 +0,"5,1","2,5","3,0","1,1 +0,"5,7","2,8","4,1","1,3 +0,"6,3","3,3","6,0","2,5 +0,"5,8","2,7","5,1","1,9 +0,"7,1","3,0","5,9","2,1 +0,"6,3","2,9","5,6","1,8 +0,"6,5","3,0","5,8","2,2 +0,"7,6","3,0","6,6","2,1 +0,"4,9","2,5","4,5","1,7 +0,"7,3","2,9","6,3","1,8 +0,"6,7","2,5","5,8","1,8 +0,"7,2","3,6","6,1","2,5 +0,"6,5","3,2","5,1","2,0 +0,"6,4","2,7","5,3","1,9 +0,"6,8","3,0","5,5","2,1 +0,"5,7","2,5","5,0","2,0 +0,"5,8","2,8","5,1","2,4 +0,"6,4","3,2","5,3","2,3 +0,"6,5","3,0","5,5","1,8 +0,"7,7","3,8","6,7","2,2 +0,"7,7","2,6","6,9","2,3 +0,"6,0","2,2","5,0","1,5 +0,"6,9","3,2","5,7","2,3 +0,"5,6","2,8","4,9","2,0 +0,"7,7","2,8","6,7","2,0 +0,"6,3","2,7","4,9","1,8 +0,"6,7","3,3","5,7","2,1 +0,"7,2","3,2","6,0","1,8 +0,"6,2","2,8","4,8","1,8 +0,"6,1","3,0","4,9","1,8 +0,"6,4","2,8","5,6","2,1 +0,"7,2","3,0","5,8","1,6 +0,"7,4","2,8","6,1","1,9 +0,"7,9","3,8","6,4","2,0 +0,"6,4","2,8","5,6","2,2 +0,"6,3","2,8","5,1","1,5 +0,"6,1","2,6","5,6","1,4 +0,"7,7","3,0","6,1","2,3 +0,"6,3","3,4","5,6","2,4 +0,"6,4","3,1","5,5","1,8 +0,"6,0","3,0","4,8","1,8 
+0,"6,9","3,1","5,4","2,1 +0,"6,7","3,1","5,6","2,4 +0,"6,9","3,1","5,1","2,3 +0,"5,8","2,7","5,1","1,9 +0,"6,8","3,2","5,9","2,3 +0,"6,7","3,3","5,7","2,5 +0,"6,7","3,0","5,2","2,3 +0,"6,3","2,5","5,0","1,9 +0,"6,5","3,0","5,2","2,0 +0,"6,2","3,4","5,4","2,3 +0,"5,9","3,0","5,1","1,8 diff --git a/test/data/iris_decimal_marker_as_comma.txt b/test/data/iris-decimal-marker-as-comma.txt similarity index 100% rename from test/data/iris_decimal_marker_as_comma.txt rename to test/data/iris-decimal-marker-as-comma.txt From 142f1305065f02dab08f3e4c580719a6e3b72897 Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Thu, 21 May 2020 20:37:02 -0700 Subject: [PATCH 08/10] Refine tests, logic, csv dataset --- .../DataLoadSave/Text/TextLoader.cs | 5 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 259 ++++++--------- test/data/iris-decimal-marker-as-comma.csv | 298 +++++++++--------- 3 files changed, 247 insertions(+), 315 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index b08635ad59..2c64b98a14 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -475,7 +475,7 @@ public class Options public char[] Separators = new[] { Defaults.Separator }; /// - /// The character that should be used as the decimal marker. + /// The character that should be used as the decimal marker. Default value is '.'. Only '.' and ',' are allowed to be decimal markers. /// [Argument(ArgumentType.AtMostOnce, Name = "Decimal Marker", HelpText = "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", ShortName = "decimal")] public char DecimalMarker = Defaults.DecimalMarker; @@ -1229,6 +1229,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo if (options.DecimalMarker != '.' 
&& options.DecimalMarker != ',') throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker); + if (!options.AllowQuoting && options.DecimalMarker == ',' && options.Separator == ",") + throw _host.ExceptUserArg(nameof(Options.AllowQuoting), "Quoting must be allowed if decimal marker and separator are the ',' character."); _decimalMarker = options.DecimalMarker; _escapeChar = options.EscapeChar; if(_separators.Contains(_escapeChar)) @@ -1432,6 +1434,7 @@ private TextLoader(IHost host, ModelLoadContext ctx) else { _escapeChar = Defaults.EscapeChar; + _decimalMarker = Defaults.DecimalMarker; } host.CheckDecode(!_separators.Contains(_escapeChar)); diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index bd0096c82c..9e31ff20f3 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -840,80 +840,101 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C() Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); } - [Fact] - public void TestCommaAsDecimalMarkerFloat() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestCommaAsDecimalMarkerFloat(bool useCsvVersion) { + // When userCsvVersion == false: // Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their // decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt. // Here, the features are of type float (Single), and the test checks for decimal markers with floats. - var mlContext = new MLContext(seed: 1); - UInt32[] labels = new uint[150]; - float[][] features = new float[150][]; + // When userCsvVersion == true: + // Check to confirm TextLoader can read data from a CSV file where the separator is ',', decimals + // enclosed with quotes, and with the decimal marker being ','. 
Features are of type float (Single), + // and the test checks for decimal markers with floats. + var mlContext = new MLContext(seed: 1); - // Read dataset with period as decimal marker. - string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); - var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() + string dataPathDecimalMarkerComma; + TextLoader.Options options = new TextLoader.Options() { Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }) }, - DecimalMarker = '.' - }); - var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod)); + }; + if (useCsvVersion) + { + dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv"); + options.DecimalMarker = ','; + options.Separator = ","; + options.AllowQuoting = true; + options.HasHeader = true; + } + else + { + dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + options.DecimalMarker = ','; + } - // Load values from iris.txt - DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema; - using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); - UInt32 labelPeriod = default; - ValueGetter labelDelegatePeriod = cursorPeriod.GetGetter(columnsPeriod[0]); - VBuffer featuresPeriod = default; - ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); + // Read dataset with comma as decimal marker. + var readerDecimalMarkerComma = new TextLoader(mlContext, options); + var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); + + // Load values from iris database with comma as decimal marker. 
+ DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; + using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); + UInt32 labelComma = default; + ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); + VBuffer featuresComma = default; + ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); // Iterate over each row and save labels and features to array for future comparison int count = 0; - while (cursorPeriod.MoveNext()) + UInt32[] labels = new uint[150]; + float[][] features = new float[150][]; + while (cursorComma.MoveNext()) { //Get values from respective columns - labelDelegatePeriod(ref labelPeriod); - featuresDelegatePeriod(ref featuresPeriod); - labels[count] = labelPeriod; - features[count] = featuresPeriod.GetValues().ToArray(); + labelDelegateComma(ref labelComma); + featuresDelegateComma(ref featuresComma); + labels[count] = labelComma; + features[count] = featuresComma.GetValues().ToArray(); count++; } - // Read dataset with comma as decimal marker. - string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); - var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + // Read dataset with period as decimal marker. + string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); + var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() { Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), }, - DecimalMarker = ',' + DecimalMarker = '.' 
}); - var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); + var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod)); - // Load values from iris-decimal-marker-as-comma.txt - DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; - using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); - UInt32 labelComma = default; - ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); - VBuffer featuresComma = default; - ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); + // Load values from iris.txt where '.' is the decimal marker. + DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema; + using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); + UInt32 labelPeriod = default; + ValueGetter labelDelegatePeriod = cursorComma.GetGetter(columnsPeriod[0]); + VBuffer featuresPeriod = default; + ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); - // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt + // Check values from database with ',' as decimal marker with database with '.' as decimal marker. 
count = 0; while (cursorComma.MoveNext()) { //Get values from respective columns - labelDelegateComma(ref labelComma); - featuresDelegateComma(ref featuresComma); - Assert.Equal(labels[count], labelComma); - Assert.Equal(features[count], featuresComma.GetValues().ToArray()); + labelDelegatePeriod(ref labelPeriod); + featuresDelegatePeriod(ref featuresPeriod); + Assert.Equal(labels[count], labelPeriod); + Assert.Equal(features[count], featuresPeriod.GetValues().ToArray()); count++; } } @@ -933,7 +954,7 @@ public void TestCommaAsDecimalMarkerDouble() Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }), }, DecimalMarker = '.' }); @@ -947,11 +968,10 @@ public void TestCommaAsDecimalMarkerDouble() VBuffer featuresPeriod = default; ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); - UInt32[] labels = new uint[150]; - double[][] features = new double[150][]; - // Iterate over each row and save labels and features to array for future comparison int count = 0; + UInt32[] labels = new uint[150]; + double[][] features = new double[150][]; while (cursorPeriod.MoveNext()) { //Get values from respective columns @@ -969,7 +989,7 @@ public void TestCommaAsDecimalMarkerDouble() Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }), }, DecimalMarker = ',' }); @@ -996,144 +1016,53 @@ public void TestCommaAsDecimalMarkerDouble() } } - [Fact] - public void TestWrongDecimalMarkerInputs() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestWrongDecimalMarkerInputs(bool useCommaAsDecimalMarker) { // When DecimalMarker does not match the 
actual decimal marker used in the dataset, // we obtain values of NaN. Check that the values are indeed NaN in this case. + // Do this check for both cases where decimal markers in the dataset are '.' and ','. var mlContext = new MLContext(seed: 1); - // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ','. - string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); - var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ',', + // and vice versa. + string dataPath; + TextLoader.Options options = new TextLoader.Options() { Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }) }, - DecimalMarker = ',' - }); - var textDataMismatched1 = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerPeriod)); + }; + if (useCommaAsDecimalMarker) + { + dataPath = GetDataPath("iris.txt"); // Has '.' as decimal marker inside dataset + options.DecimalMarker = ','; // Choose wrong decimal marker on purpose + } + else + { + dataPath = GetDataPath("iris-decimal-marker-as-comma.txt"); // Has ',' as decimal marker inside dataset + options.DecimalMarker = '.'; // Choose wrong decimal marker on purpose + } + var reader = new TextLoader(mlContext, options); + var textData = reader.Load(GetDataPath(dataPath)); // Check that the features being loaded are NaN. 
- DataViewSchema columnsPeriod = textDataMismatched1.Schema; - using DataViewRowCursor cursorPeriod = textDataMismatched1.GetRowCursor(columnsPeriod); + DataViewSchema columns = textData.Schema; + using DataViewRowCursor cursor = textData.GetRowCursor(columns); VBuffer featuresPeriod = default; - ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]); + ValueGetter> featuresDelegatePeriod = cursor.GetGetter>(columns[1]); - // Iterate over each row - while (cursorPeriod.MoveNext()) + // Iterate over each row and check that feature values are NaN. + while (cursor.MoveNext()) { featuresDelegatePeriod.Invoke(ref featuresPeriod); foreach(float feature in featuresPeriod.GetValues().ToArray()) Assert.Equal(feature, Single.NaN); } - - // Try reading a dataset where ',' is the actual decimal marker, but DecimalMarker = '.'. - string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); - var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), - }, - DecimalMarker = '.' 
- }); - var textDataMismatched2 = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerComma)); - - DataViewSchema columnsComma = textDataMismatched2.Schema; - using DataViewRowCursor cursorComma = textDataMismatched2.GetRowCursor(columnsComma); - VBuffer featuresComma = default; - ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); - - // Iterate over each row - while (cursorComma.MoveNext()) - { - featuresDelegateComma.Invoke(ref featuresComma); - foreach (float feature in featuresComma.GetValues().ToArray()) - Assert.Equal(feature, Single.NaN); - } - } - - [Fact] - public void TestCommaAsDecimalMarkerWithSeperatorAsCommaInCSV() - { - // Check to confirm TextLoader can read data from a CSV file where the separator is ',' and decimals - // enclosed with quotes and with the decimal marker ',' can be successfully read. - string dataPathCsv = GetDataPath("iris-decimal-marker-as-comma.csv"); - - var mlContext = new MLContext(seed: 1); - var readerCsv = new TextLoader(mlContext, new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("Label", DataKind.Single, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), - }, - DecimalMarker = ',', - Separator = ",", - AllowQuoting = true, - HasHeader = true - }); - var textDataCsv = readerCsv.Load(GetDataPath(dataPathCsv)); - - // Load values from iris-decimal-marker-as-comma.csv - DataViewSchema columnsCsv = textDataCsv.Schema; - using DataViewRowCursor cursorCsv = textDataCsv.GetRowCursor(columnsCsv); - UInt32 labelCsv = default; - ValueGetter labelDelegatePeriod = cursorCsv.GetGetter(columnsCsv[0]); - VBuffer featuresCsv = default; - ValueGetter> featuresDelegatePeriod = cursorCsv.GetGetter>(columnsCsv[1]); - - UInt32[] labels = new uint[150]; - double[][] features = new double[150][]; - - // Iterate over each row and save labels and features to array for future comparison - int count = 0; - while (cursorCsv.MoveNext()) - { - 
//Get values from respective columns - labelDelegatePeriod(ref labelCsv); - featuresDelegatePeriod(ref featuresCsv); - labels[count] = labelCsv; - features[count] = featuresCsv.GetValues().ToArray(); - count++; - } - - // Read dataset with comma as decimal marker. - string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); - var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), - }, - DecimalMarker = ',' - }); - var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); - - // Load values from iris-decimal-marker-as-comma.txt - DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; - using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); - UInt32 labelComma = default; - ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]); - VBuffer featuresComma = default; - ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]); - - // Check values from iris-decimal-marker-as-comma.txt match those in iris-decimal-marker-as-comma.csv - count = 0; - while (cursorComma.MoveNext()) - { - //Get values from respective columns - labelDelegateComma(ref labelComma); - featuresDelegateComma(ref featuresComma); - Assert.Equal(labels[count], labelComma); - Assert.Equal(features[count], featuresComma.GetValues().ToArray()); - count++; - } } private class IrisNoFields diff --git a/test/data/iris-decimal-marker-as-comma.csv b/test/data/iris-decimal-marker-as-comma.csv index 0612dd281e..db1986ec8e 100644 --- a/test/data/iris-decimal-marker-as-comma.csv +++ b/test/data/iris-decimal-marker-as-comma.csv @@ -1,151 +1,151 @@ Label,Sepal,length,Sepal,width,Petal length,Petal width -0,"5,1","3,5","1,4","0.2" -0,"4,9","3,0","1,4","0.2" 
-0,"4,7","3,2","1,3","0.2" -0,"4,6","3,1","1,5","0.2" -0,"5,0","3,6","1,4","0.2" -0,"5,4","3,9","1,7","0.4" -0,"4,6","3,4","1,4","0.3" -0,"5,0","3,4","1,5","0.2" -0,"4,4","2,9","1,4","0.2" -0,"4,9","3,1","1,5","0.1" -0,"5,4","3,7","1,5","0.2" -0,"4,8","3,4","1,6","0.2" -0,"4,8","3,0","1,4","0.1" -0,"4,3","3,0","1,1","0.1" -0,"5,8","4,0","1,2","0.2" -0,"5,7","4,4","1,5","0.4" -0,"5,4","3,9","1,3","0.4" -0,"5,1","3,5","1,4","0.3" -0,"5,7","3,8","1,7","0.3" -0,"5,1","3,8","1,5","0.3" -0,"5,4","3,4","1,7","0.2" -0,"5,1","3,7","1,5","0.4" -0,"4,6","3,6","1,0","0.2" +0,"5,1","3,5","1,4","0,2" +0,"4,9","3,0","1,4","0,2" +0,"4,7","3,2","1,3","0,2" +0,"4,6","3,1","1,5","0,2" +0,"5,0","3,6","1,4","0,2" +0,"5,4","3,9","1,7","0,4" +0,"4,6","3,4","1,4","0,3" +0,"5,0","3,4","1,5","0,2" +0,"4,4","2,9","1,4","0,2" +0,"4,9","3,1","1,5","0,1" +0,"5,4","3,7","1,5","0,2" +0,"4,8","3,4","1,6","0,2" +0,"4,8","3,0","1,4","0,1" +0,"4,3","3,0","1,1","0,1" +0,"5,8","4,0","1,2","0,2" +0,"5,7","4,4","1,5","0,4" +0,"5,4","3,9","1,3","0,4" +0,"5,1","3,5","1,4","0,3" +0,"5,7","3,8","1,7","0,3" +0,"5,1","3,8","1,5","0,3" +0,"5,4","3,4","1,7","0,2" +0,"5,1","3,7","1,5","0,4" +0,"4,6","3,6","1,0","0,2" 0,"5,1","3,3","1,7","0,5" -0,"4,8","3,4","1,9","0.2" -0,"5,0","3,0","1,6","0.2" -0,"5,0","3,4","1,6","0.4" -0,"5,2","3,5","1,5","0.2" -0,"5,2","3,4","1,4","0.2" -0,"4,7","3,2","1,6","0.2" -0,"4,8","3,1","1,6","0.2" -0,"5,4","3,4","1,5","0.4" -0,"5,2","4,1","1,5","0.1" -0,"5,5","4,2","1,4","0.2" -0,"4,9","3,1","1,5","0.1" -0,"5,0","3,2","1,2","0.2" -0,"5,5","3,5","1,3","0.2" -0,"4,9","3,1","1,5","0.1" -0,"4,4","3,0","1,3","0.2" -0,"5,1","3,4","1,5","0.2" -0,"5,0","3,5","1,3","0.3" -0,"4,5","2,3","1,3","0.3" -0,"4,4","3,2","1,3","0.2" -0,"5,0","3,5","1,6","0.6" -0,"5,1","3,8","1,9","0.4" -0,"4,8","3,0","1,4","0.3" -0,"5,1","3,8","1,6","0.2" -0,"4,6","3,2","1,4","0.2" -0,"5,3","3,7","1,5","0.2" -0,"5,0","3,3","1,4","0.2" -0,"7,0","3,2","4,7","1,4 -0,"6,4","3,2","4,5","1,5 -0,"6,9","3,1","4,9","1,5 
-0,"5,5","2,3","4,0","1,3 -0,"6,5","2,8","4,6","1,5 -0,"5,7","2,8","4,5","1,3 -0,"6,3","3,3","4,7","1,6 -0,"4,9","2,4","3,3","1,0 -0,"6,6","2,9","4,6","1,3 -0,"5,2","2,7","3,9","1,4 -0,"5,0","2,0","3,5","1,0 -0,"5,9","3,0","4,2","1,5 -0,"6,0","2,2","4,0","1,0 -0,"6,1","2,9","4,7","1,4 -0,"5,6","2,9","3,6","1,3 -0,"6,7","3,1","4,4","1,4 -0,"5,6","3,0","4,5","1,5 -0,"5,8","2,7","4,1","1,0 -0,"6,2","2,2","4,5","1,5 -0,"5,6","2,5","3,9","1,1 -0,"5,9","3,2","4,8","1,8 -0,"6,1","2,8","4,0","1,3 -0,"6,3","2,5","4,9","1,5 -0,"6,1","2,8","4,7","1,2 -0,"6,4","2,9","4,3","1,3 -0,"6,6","3,0","4,4","1,4 -0,"6,8","2,8","4,8","1,4 -0,"6,7","3,0","5,0","1,7 -0,"6,0","2,9","4,5","1,5 -0,"5,7","2,6","3,5","1,0 -0,"5,5","2,4","3,8","1,1 -0,"5,5","2,4","3,7","1,0 -0,"5,8","2,7","3,9","1,2 -0,"6,0","2,7","5,1","1,6 -0,"5,4","3,0","4,5","1,5 -0,"6,0","3,4","4,5","1,6 -0,"6,7","3,1","4,7","1,5 -0,"6,3","2,3","4,4","1,3 -0,"5,6","3,0","4,1","1,3 -0,"5,5","2,5","4,0","1,3 -0,"5,5","2,6","4,4","1,2 -0,"6,1","3,0","4,6","1,4 -0,"5,8","2,6","4,0","1,2 -0,"5,0","2,3","3,3","1,0 -0,"5,6","2,7","4,2","1,3 -0,"5,7","3,0","4,2","1,2 -0,"5,7","2,9","4,2","1,3 -0,"6,2","2,9","4,3","1,3 -0,"5,1","2,5","3,0","1,1 -0,"5,7","2,8","4,1","1,3 -0,"6,3","3,3","6,0","2,5 -0,"5,8","2,7","5,1","1,9 -0,"7,1","3,0","5,9","2,1 -0,"6,3","2,9","5,6","1,8 -0,"6,5","3,0","5,8","2,2 -0,"7,6","3,0","6,6","2,1 -0,"4,9","2,5","4,5","1,7 -0,"7,3","2,9","6,3","1,8 -0,"6,7","2,5","5,8","1,8 -0,"7,2","3,6","6,1","2,5 -0,"6,5","3,2","5,1","2,0 -0,"6,4","2,7","5,3","1,9 -0,"6,8","3,0","5,5","2,1 -0,"5,7","2,5","5,0","2,0 -0,"5,8","2,8","5,1","2,4 -0,"6,4","3,2","5,3","2,3 -0,"6,5","3,0","5,5","1,8 -0,"7,7","3,8","6,7","2,2 -0,"7,7","2,6","6,9","2,3 -0,"6,0","2,2","5,0","1,5 -0,"6,9","3,2","5,7","2,3 -0,"5,6","2,8","4,9","2,0 -0,"7,7","2,8","6,7","2,0 -0,"6,3","2,7","4,9","1,8 -0,"6,7","3,3","5,7","2,1 -0,"7,2","3,2","6,0","1,8 -0,"6,2","2,8","4,8","1,8 -0,"6,1","3,0","4,9","1,8 -0,"6,4","2,8","5,6","2,1 
-0,"7,2","3,0","5,8","1,6 -0,"7,4","2,8","6,1","1,9 -0,"7,9","3,8","6,4","2,0 -0,"6,4","2,8","5,6","2,2 -0,"6,3","2,8","5,1","1,5 -0,"6,1","2,6","5,6","1,4 -0,"7,7","3,0","6,1","2,3 -0,"6,3","3,4","5,6","2,4 -0,"6,4","3,1","5,5","1,8 -0,"6,0","3,0","4,8","1,8 -0,"6,9","3,1","5,4","2,1 -0,"6,7","3,1","5,6","2,4 -0,"6,9","3,1","5,1","2,3 -0,"5,8","2,7","5,1","1,9 -0,"6,8","3,2","5,9","2,3 -0,"6,7","3,3","5,7","2,5 -0,"6,7","3,0","5,2","2,3 -0,"6,3","2,5","5,0","1,9 -0,"6,5","3,0","5,2","2,0 -0,"6,2","3,4","5,4","2,3 -0,"5,9","3,0","5,1","1,8 +0,"4,8","3,4","1,9","0,2" +0,"5,0","3,0","1,6","0,2" +0,"5,0","3,4","1,6","0,4" +0,"5,2","3,5","1,5","0,2" +0,"5,2","3,4","1,4","0,2" +0,"4,7","3,2","1,6","0,2" +0,"4,8","3,1","1,6","0,2" +0,"5,4","3,4","1,5","0,4" +0,"5,2","4,1","1,5","0,1" +0,"5,5","4,2","1,4","0,2" +0,"4,9","3,1","1,5","0,1" +0,"5,0","3,2","1,2","0,2" +0,"5,5","3,5","1,3","0,2" +0,"4,9","3,1","1,5","0,1" +0,"4,4","3,0","1,3","0,2" +0,"5,1","3,4","1,5","0,2" +0,"5,0","3,5","1,3","0,3" +0,"4,5","2,3","1,3","0,3" +0,"4,4","3,2","1,3","0,2" +0,"5,0","3,5","1,6","0,6" +0,"5,1","3,8","1,9","0,4" +0,"4,8","3,0","1,4","0,3" +0,"5,1","3,8","1,6","0,2" +0,"4,6","3,2","1,4","0,2" +0,"5,3","3,7","1,5","0,2" +0,"5,0","3,3","1,4","0,2" +1,"7,0","3,2","4,7","1,4" +1,"6,4","3,2","4,5","1,5" +1,"6,9","3,1","4,9","1,5" +1,"5,5","2,3","4,0","1,3" +1,"6,5","2,8","4,6","1,5" +1,"5,7","2,8","4,5","1,3" +1,"6,3","3,3","4,7","1,6" +1,"4,9","2,4","3,3","1,0" +1,"6,6","2,9","4,6","1,3" +1,"5,2","2,7","3,9","1,4" +1,"5,0","2,0","3,5","1,0" +1,"5,9","3,0","4,2","1,5" +1,"6,0","2,2","4,0","1,0" +1,"6,1","2,9","4,7","1,4" +1,"5,6","2,9","3,6","1,3" +1,"6,7","3,1","4,4","1,4" +1,"5,6","3,0","4,5","1,5" +1,"5,8","2,7","4,1","1,0" +1,"6,2","2,2","4,5","1,5" +1,"5,6","2,5","3,9","1,1" +1,"5,9","3,2","4,8","1,8" +1,"6,1","2,8","4,0","1,3" +1,"6,3","2,5","4,9","1,5" +1,"6,1","2,8","4,7","1,2" +1,"6,4","2,9","4,3","1,3" +1,"6,6","3,0","4,4","1,4" +1,"6,8","2,8","4,8","1,4" 
+1,"6,7","3,0","5,0","1,7" +1,"6,0","2,9","4,5","1,5" +1,"5,7","2,6","3,5","1,0" +1,"5,5","2,4","3,8","1,1" +1,"5,5","2,4","3,7","1,0" +1,"5,8","2,7","3,9","1,2" +1,"6,0","2,7","5,1","1,6" +1,"5,4","3,0","4,5","1,5" +1,"6,0","3,4","4,5","1,6" +1,"6,7","3,1","4,7","1,5" +1,"6,3","2,3","4,4","1,3" +1,"5,6","3,0","4,1","1,3" +1,"5,5","2,5","4,0","1,3" +1,"5,5","2,6","4,4","1,2" +1,"6,1","3,0","4,6","1,4" +1,"5,8","2,6","4,0","1,2" +1,"5,0","2,3","3,3","1,0" +1,"5,6","2,7","4,2","1,3" +1,"5,7","3,0","4,2","1,2" +1,"5,7","2,9","4,2","1,3" +1,"6,2","2,9","4,3","1,3" +1,"5,1","2,5","3,0","1,1" +1,"5,7","2,8","4,1","1,3" +2,"6,3","3,3","6,0","2,5" +2,"5,8","2,7","5,1","1,9" +2,"7,1","3,0","5,9","2,1" +2,"6,3","2,9","5,6","1,8" +2,"6,5","3,0","5,8","2,2" +2,"7,6","3,0","6,6","2,1" +2,"4,9","2,5","4,5","1,7" +2,"7,3","2,9","6,3","1,8" +2,"6,7","2,5","5,8","1,8" +2,"7,2","3,6","6,1","2,5" +2,"6,5","3,2","5,1","2,0" +2,"6,4","2,7","5,3","1,9" +2,"6,8","3,0","5,5","2,1" +2,"5,7","2,5","5,0","2,0" +2,"5,8","2,8","5,1","2,4" +2,"6,4","3,2","5,3","2,3" +2,"6,5","3,0","5,5","1,8" +2,"7,7","3,8","6,7","2,2" +2,"7,7","2,6","6,9","2,3" +2,"6,0","2,2","5,0","1,5" +2,"6,9","3,2","5,7","2,3" +2,"5,6","2,8","4,9","2,0" +2,"7,7","2,8","6,7","2,0" +2,"6,3","2,7","4,9","1,8" +2,"6,7","3,3","5,7","2,1" +2,"7,2","3,2","6,0","1,8" +2,"6,2","2,8","4,8","1,8" +2,"6,1","3,0","4,9","1,8" +2,"6,4","2,8","5,6","2,1" +2,"7,2","3,0","5,8","1,6" +2,"7,4","2,8","6,1","1,9" +2,"7,9","3,8","6,4","2,0" +2,"6,4","2,8","5,6","2,2" +2,"6,3","2,8","5,1","1,5" +2,"6,1","2,6","5,6","1,4" +2,"7,7","3,0","6,1","2,3" +2,"6,3","3,4","5,6","2,4" +2,"6,4","3,1","5,5","1,8" +2,"6,0","3,0","4,8","1,8" +2,"6,9","3,1","5,4","2,1" +2,"6,7","3,1","5,6","2,4" +2,"6,9","3,1","5,1","2,3" +2,"5,8","2,7","5,1","1,9" +2,"6,8","3,2","5,9","2,3" +2,"6,7","3,3","5,7","2,5" +2,"6,7","3,0","5,2","2,3" +2,"6,3","2,5","5,0","1,9" +2,"6,5","3,0","5,2","2,0" +2,"6,2","3,4","5,4","2,3" +2,"5,9","3,0","5,1","1,8" From 
247af634301e7116481dec8676b8d0dfcd3c557e Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Fri, 22 May 2020 11:01:45 -0700 Subject: [PATCH 09/10] nit fix --- src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 2c64b98a14..6bc58de054 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1229,7 +1229,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo if (options.DecimalMarker != '.' && options.DecimalMarker != ',') throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker); - if (!options.AllowQuoting && options.DecimalMarker == ',' && options.Separator == ",") + if (!options.AllowQuoting && options.DecimalMarker == ',' && _separators.Contains(',')) throw _host.ExceptUserArg(nameof(Options.AllowQuoting), "Quoting must be allowed if decimal marker and separator are the ',' character."); _decimalMarker = options.DecimalMarker; _escapeChar = options.EscapeChar; From 8492472c0ca808ffd7397f37370029e618f0b25a Mon Sep 17 00:00:00 2001 From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com> Date: Fri, 22 May 2020 12:50:35 -0700 Subject: [PATCH 10/10] Compressed tests using --- test/Microsoft.ML.Tests/TextLoaderTests.cs | 143 ++++++--------------- 1 file changed, 40 insertions(+), 103 deletions(-) diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index 9e31ff20f3..a4d44c5cc2 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -843,108 +843,30 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C() [Theory] [InlineData(true)] 
[InlineData(false)] - public void TestCommaAsDecimalMarkerFloat(bool useCsvVersion) + public void TestCommaAsDecimalMarker(bool useCsvVersion) { // When userCsvVersion == false: // Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their // decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt. - // Here, the features are of type float (Single), and the test checks for decimal markers with floats. // When userCsvVersion == true: // Check to confirm TextLoader can read data from a CSV file where the separator is ',', decimals - // enclosed with quotes, and with the decimal marker being ','. Features are of type float (Single), - // and the test checks for decimal markers with floats. - var mlContext = new MLContext(seed: 1); + // are enclosed with quotes, and with the decimal marker being ','. - string dataPathDecimalMarkerComma; - TextLoader.Options options = new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }) - }, - }; - if (useCsvVersion) - { - dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv"); - options.DecimalMarker = ','; - options.Separator = ","; - options.AllowQuoting = true; - options.HasHeader = true; - } - else - { - dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); - options.DecimalMarker = ','; - } - - // Read dataset with comma as decimal marker. - var readerDecimalMarkerComma = new TextLoader(mlContext, options); - var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); - - // Load values from iris database with comma as decimal marker. 
- DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; - using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); - UInt32 labelComma = default; - ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]); - VBuffer<float> featuresComma = default; - ValueGetter<VBuffer<float>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<float>>(columnsComma[1]); - - // Iterate over each row and save labels and features to array for future comparison - int count = 0; - UInt32[] labels = new uint[150]; - float[][] features = new float[150][]; - while (cursorComma.MoveNext()) - { - //Get values from respective columns - labelDelegateComma(ref labelComma); - featuresDelegateComma(ref featuresComma); - labels[count] = labelComma; - features[count] = featuresComma.GetValues().ToArray(); - count++; - } - - // Read dataset with period as decimal marker. - string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt"); - var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }), - }, - DecimalMarker = '.' - }); - var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod)); - - // Load values from iris.txt where '.' is the decimal marker. - DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema; - using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); - UInt32 labelPeriod = default; - ValueGetter<UInt32> labelDelegatePeriod = cursorComma.GetGetter<UInt32>(columnsPeriod[0]); - VBuffer<float> featuresPeriod = default; - ValueGetter<VBuffer<float>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<float>>(columnsPeriod[1]); - - // Check values from database with ',' as decimal marker with database with '.' as decimal marker. 
- count = 0; - while (cursorComma.MoveNext()) - { - //Get values from respective columns - labelDelegatePeriod(ref labelPeriod); - featuresDelegatePeriod(ref featuresPeriod); - Assert.Equal(labels[count], labelPeriod); - Assert.Equal(features[count], featuresPeriod.GetValues().ToArray()); - count++; - } + // Do these checks with both float and double as types of features being read, to test decimal marker + // recognition with both doubles and floats. + TestCommaAsDecimalMarkerHelper<float>(useCsvVersion); + TestCommaAsDecimalMarkerHelper<double>(useCsvVersion); } - - [Fact] - public void TestCommaAsDecimalMarkerDouble() + + private void TestCommaAsDecimalMarkerHelper<T>(bool useCsvVersion) { // Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their // decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt. - // Here, the features are of type double, and the test checks for decimal markers with double. + // Datasets iris.txt and iris-decimal-marker-as-comma.csv have the exact same data, however the .csv + // version has ',' as decimal marker and separator, and feature values are enclosed with quotes. + // T varies as either float or double, so that decimal markers can be tested for both floating + // point value types. var mlContext = new MLContext(seed: 1); // Read dataset with period as decimal marker. @@ -954,7 +876,7 @@ public void TestCommaAsDecimalMarkerDouble() Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) }), }, DecimalMarker = '.' 
}); @@ -965,13 +887,13 @@ public void TestCommaAsDecimalMarkerDouble() using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod); UInt32 labelPeriod = default; ValueGetter<UInt32> labelDelegatePeriod = cursorPeriod.GetGetter<UInt32>(columnsPeriod[0]); - VBuffer<double> featuresPeriod = default; - ValueGetter<VBuffer<double>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<double>>(columnsPeriod[1]); + VBuffer<T> featuresPeriod = default; + ValueGetter<VBuffer<T>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<T>>(columnsPeriod[1]); // Iterate over each row and save labels and features to array for future comparison int count = 0; UInt32[] labels = new uint[150]; - double[][] features = new double[150][]; + T[][] features = new T[150][]; while (cursorPeriod.MoveNext()) { //Get values from respective columns @@ -983,27 +905,42 @@ public void TestCommaAsDecimalMarkerDouble() } // Read dataset with comma as decimal marker. - string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); - var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options() + // Dataset is either the .csv version or the .txt version. + string dataPathDecimalMarkerComma; + TextLoader.Options options = new TextLoader.Options() { Columns = new[] { new TextLoader.Column("Label", DataKind.UInt32, 0), - new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }), + new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) }) }, - DecimalMarker = ',' - }); + }; + // Set TextLoader.Options for the .csv or .txt cases. 
+ if (useCsvVersion) + { + dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv"); + options.DecimalMarker = ','; + options.Separator = ","; + options.AllowQuoting = true; + options.HasHeader = true; + } + else + { + dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt"); + options.DecimalMarker = ','; + } + var readerDecimalMarkerComma = new TextLoader(mlContext, options); var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma)); - // Load values from iris-decimal-marker-as-comma.txt + // Load values from dataset with comma as decimal marker DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema; using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma); UInt32 labelComma = default; ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]); - VBuffer<double> featuresComma = default; - ValueGetter<VBuffer<double>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<double>>(columnsComma[1]); + VBuffer<T> featuresComma = default; + ValueGetter<VBuffer<T>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<T>>(columnsComma[1]); - // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt + // Check values from dataset with comma as decimal marker match those in iris.txt (period decimal marker) count = 0; while (cursorComma.MoveNext()) {