From d375d908c7858d04b410750b9d5165e163c3609e Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Tue, 19 May 2020 19:49:44 -0700
Subject: [PATCH 01/10] Added decimal marker option in TextLoader
---
.../DataLoadSave/Text/TextLoader.cs | 21 ++-
.../Text/TextLoaderSaverCatalog.cs | 10 +-
.../Common/EntryPoints/core_manifest.json | 12 ++
test/data/iris_decimal_marker_as_comma.txt | 151 ++++++++++++++++++
4 files changed, 190 insertions(+), 4 deletions(-)
create mode 100644 test/data/iris_decimal_marker_as_comma.txt
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 7ea6ab17e9..0d51a16be2 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -474,6 +474,12 @@ public class Options
[Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")]
public char[] Separators = new[] { Defaults.Separator };
+ ///
+ /// The character that should be used as the decimal marker.
+ ///
+ [Argument(ArgumentType.AtMostOnce, Name = "Decimal Marker", HelpText = "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", ShortName = "decimal")]
+ public char DecimalMarker = Defaults.DecimalMarker;
+
///
/// Specifies the input columns that should be mapped to columns.
///
@@ -535,6 +541,7 @@ internal static class Defaults
internal const bool AllowQuoting = false;
internal const bool AllowSparse = false;
internal const char Separator = '\t';
+ internal const char DecimalMarker = '.';
internal const bool HasHeader = false;
internal const bool TrimWhitespace = false;
internal const bool ReadMultilines = false;
@@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo()
// verWrittenCur: 0x00010009, // Introduced _flags
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
- verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+ //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+ verWrittenCur: 0x0001000D, // Added decimal marker option to allow for ',' to be a decimal marker
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
@@ -1094,6 +1102,7 @@ private enum OptionFlags : uint
// Input size is zero for unknown - determined by the data (including sparse rows).
private readonly int _inputSize;
private readonly char[] _separators;
+ private readonly char _decimalMarker;
private readonly Bindings _bindings;
private readonly Parser _parser;
@@ -1210,6 +1219,9 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}
+ if (options.DecimalMarker == ',' && _separators.Contains(','))
+ throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker);
+ _decimalMarker = options.DecimalMarker;
_bindings = new Bindings(this, cols, headerFile, dataSample);
_parser = new Parser(this);
}
@@ -1373,6 +1385,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
+ // char: decimal marker
// bindings
int cbFloat = ctx.Reader.ReadInt32();
host.CheckDecode(cbFloat == sizeof(float));
@@ -1397,6 +1410,8 @@ private TextLoader(IHost host, ModelLoadContext ctx)
if (_separators.Contains(':'))
host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0);
+ _decimalMarker = ctx.Reader.ReadChar();
+ host.CheckDecode(_decimalMarker == '.' || _decimalMarker == ',');
_bindings = new Bindings(ctx, this);
_parser = new Parser(this);
}
@@ -1437,6 +1452,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
+ // char: decimal marker
// bindings
ctx.Writer.Write(sizeof(float));
ctx.Writer.Write(_maxRows);
@@ -1445,6 +1461,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
_host.Assert(0 <= _inputSize && _inputSize < SrcLim);
ctx.Writer.Write(_inputSize);
ctx.Writer.WriteCharArray(_separators);
+ ctx.Writer.Write(_decimalMarker);
_bindings.Save(ctx);
}
@@ -1470,6 +1487,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
internal static TextLoader CreateTextLoader(IHostEnvironment host,
bool hasHeader = Defaults.HasHeader,
char separator = Defaults.Separator,
+ char decimalMarker = Defaults.DecimalMarker,
bool allowQuoting = Defaults.AllowQuoting,
bool supportSparse = Defaults.AllowSparse,
bool trimWhitespace = Defaults.TrimWhitespace,
@@ -1479,6 +1497,7 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host,
{
HasHeader = hasHeader,
Separators = new[] { separator },
+ DecimalMarker = decimalMarker,
AllowQuoting = allowQuoting,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
index 0fcc23fcef..d230f360ba 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
@@ -93,6 +93,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// names and their data types in the schema of the loaded data.
/// The catalog.
/// Column separator character. Default is '\t'
+ /// Decimal separator character. Default is '.'
/// Whether the file has a header with feature names. When a is provided,
/// indicates that the first line in the will be used for feature names, and that when
/// is called, the first line will be skipped. When there is no provided, just indicates that the loader should
@@ -111,13 +112,14 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
char separatorChar = TextLoader.Defaults.Separator,
+ char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
- => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting,
- allowSparse, trimWhitespace, dataSample: dataSample);
+ => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, decimalChar,
+ allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample);
///
/// Create a text loader by inferencing the dataset schema from a data model type.
@@ -221,6 +223,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str
/// The catalog.
/// The path to the file.
/// Column separator character. Default is '\t'
+ /// Decimal separator character. Default is '.'
/// Whether the file has a header. When , the loader will skip the first line when
/// is called.
/// Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
@@ -240,6 +243,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str
public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
string path,
char separatorChar = TextLoader.Defaults.Separator,
+ char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
@@ -254,7 +258,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar,
- allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path));
+ decimalChar, allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path));
}
///
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 67033afde3..4dc8b278c3 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -369,6 +369,18 @@
"\t"
]
},
+ {
+ "Name": "Decimal Marker",
+ "Type": "Char",
+ "Desc": "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.",
+ "Aliases": [
+ "decimal"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "."
+ },
{
"Name": "TrimWhitespace",
"Type": "Bool",
diff --git a/test/data/iris_decimal_marker_as_comma.txt b/test/data/iris_decimal_marker_as_comma.txt
new file mode 100644
index 0000000000..d9f3b06b4a
--- /dev/null
+++ b/test/data/iris_decimal_marker_as_comma.txt
@@ -0,0 +1,151 @@
+#Label Sepal length Sepal width Petal length Petal width
+0 5,1 3,5 1,4 0,2
+0 4,9 3,0 1,4 0,2
+0 4,7 3,2 1,3 0,2
+0 4,6 3,1 1,5 0,2
+0 5,0 3,6 1,4 0,2
+0 5,4 3,9 1,7 0,4
+0 4,6 3,4 1,4 0,3
+0 5,0 3,4 1,5 0,2
+0 4,4 2,9 1,4 0,2
+0 4,9 3,1 1,5 0,1
+0 5,4 3,7 1,5 0,2
+0 4,8 3,4 1,6 0,2
+0 4,8 3,0 1,4 0,1
+0 4,3 3,0 1,1 0,1
+0 5,8 4,0 1,2 0,2
+0 5,7 4,4 1,5 0,4
+0 5,4 3,9 1,3 0,4
+0 5,1 3,5 1,4 0,3
+0 5,7 3,8 1,7 0,3
+0 5,1 3,8 1,5 0,3
+0 5,4 3,4 1,7 0,2
+0 5,1 3,7 1,5 0,4
+0 4,6 3,6 1,0 0,2
+0 5,1 3,3 1,7 0,5
+0 4,8 3,4 1,9 0,2
+0 5,0 3,0 1,6 0,2
+0 5,0 3,4 1,6 0,4
+0 5,2 3,5 1,5 0,2
+0 5,2 3,4 1,4 0,2
+0 4,7 3,2 1,6 0,2
+0 4,8 3,1 1,6 0,2
+0 5,4 3,4 1,5 0,4
+0 5,2 4,1 1,5 0,1
+0 5,5 4,2 1,4 0,2
+0 4,9 3,1 1,5 0,1
+0 5,0 3,2 1,2 0,2
+0 5,5 3,5 1,3 0,2
+0 4,9 3,1 1,5 0,1
+0 4,4 3,0 1,3 0,2
+0 5,1 3,4 1,5 0,2
+0 5,0 3,5 1,3 0,3
+0 4,5 2,3 1,3 0,3
+0 4,4 3,2 1,3 0,2
+0 5,0 3,5 1,6 0,6
+0 5,1 3,8 1,9 0,4
+0 4,8 3,0 1,4 0,3
+0 5,1 3,8 1,6 0,2
+0 4,6 3,2 1,4 0,2
+0 5,3 3,7 1,5 0,2
+0 5,0 3,3 1,4 0,2
+1 7,0 3,2 4,7 1,4
+1 6,4 3,2 4,5 1,5
+1 6,9 3,1 4,9 1,5
+1 5,5 2,3 4,0 1,3
+1 6,5 2,8 4,6 1,5
+1 5,7 2,8 4,5 1,3
+1 6,3 3,3 4,7 1,6
+1 4,9 2,4 3,3 1,0
+1 6,6 2,9 4,6 1,3
+1 5,2 2,7 3,9 1,4
+1 5,0 2,0 3,5 1,0
+1 5,9 3,0 4,2 1,5
+1 6,0 2,2 4,0 1,0
+1 6,1 2,9 4,7 1,4
+1 5,6 2,9 3,6 1,3
+1 6,7 3,1 4,4 1,4
+1 5,6 3,0 4,5 1,5
+1 5,8 2,7 4,1 1,0
+1 6,2 2,2 4,5 1,5
+1 5,6 2,5 3,9 1,1
+1 5,9 3,2 4,8 1,8
+1 6,1 2,8 4,0 1,3
+1 6,3 2,5 4,9 1,5
+1 6,1 2,8 4,7 1,2
+1 6,4 2,9 4,3 1,3
+1 6,6 3,0 4,4 1,4
+1 6,8 2,8 4,8 1,4
+1 6,7 3,0 5,0 1,7
+1 6,0 2,9 4,5 1,5
+1 5,7 2,6 3,5 1,0
+1 5,5 2,4 3,8 1,1
+1 5,5 2,4 3,7 1,0
+1 5,8 2,7 3,9 1,2
+1 6,0 2,7 5,1 1,6
+1 5,4 3,0 4,5 1,5
+1 6,0 3,4 4,5 1,6
+1 6,7 3,1 4,7 1,5
+1 6,3 2,3 4,4 1,3
+1 5,6 3,0 4,1 1,3
+1 5,5 2,5 4,0 1,3
+1 5,5 2,6 4,4 1,2
+1 6,1 3,0 4,6 1,4
+1 5,8 2,6 4,0 1,2
+1 5,0 2,3 3,3 1,0
+1 5,6 2,7 4,2 1,3
+1 5,7 3,0 4,2 1,2
+1 5,7 2,9 4,2 1,3
+1 6,2 2,9 4,3 1,3
+1 5,1 2,5 3,0 1,1
+1 5,7 2,8 4,1 1,3
+2 6,3 3,3 6,0 2,5
+2 5,8 2,7 5,1 1,9
+2 7,1 3,0 5,9 2,1
+2 6,3 2,9 5,6 1,8
+2 6,5 3,0 5,8 2,2
+2 7,6 3,0 6,6 2,1
+2 4,9 2,5 4,5 1,7
+2 7,3 2,9 6,3 1,8
+2 6,7 2,5 5,8 1,8
+2 7,2 3,6 6,1 2,5
+2 6,5 3,2 5,1 2,0
+2 6,4 2,7 5,3 1,9
+2 6,8 3,0 5,5 2,1
+2 5,7 2,5 5,0 2,0
+2 5,8 2,8 5,1 2,4
+2 6,4 3,2 5,3 2,3
+2 6,5 3,0 5,5 1,8
+2 7,7 3,8 6,7 2,2
+2 7,7 2,6 6,9 2,3
+2 6,0 2,2 5,0 1,5
+2 6,9 3,2 5,7 2,3
+2 5,6 2,8 4,9 2,0
+2 7,7 2,8 6,7 2,0
+2 6,3 2,7 4,9 1,8
+2 6,7 3,3 5,7 2,1
+2 7,2 3,2 6,0 1,8
+2 6,2 2,8 4,8 1,8
+2 6,1 3,0 4,9 1,8
+2 6,4 2,8 5,6 2,1
+2 7,2 3,0 5,8 1,6
+2 7,4 2,8 6,1 1,9
+2 7,9 3,8 6,4 2,0
+2 6,4 2,8 5,6 2,2
+2 6,3 2,8 5,1 1,5
+2 6,1 2,6 5,6 1,4
+2 7,7 3,0 6,1 2,3
+2 6,3 3,4 5,6 2,4
+2 6,4 3,1 5,5 1,8
+2 6,0 3,0 4,8 1,8
+2 6,9 3,1 5,4 2,1
+2 6,7 3,1 5,6 2,4
+2 6,9 3,1 5,1 2,3
+2 5,8 2,7 5,1 1,9
+2 6,8 3,2 5,9 2,3
+2 6,7 3,3 5,7 2,5
+2 6,7 3,0 5,2 2,3
+2 6,3 2,5 5,0 1,9
+2 6,5 3,0 5,2 2,0
+2 6,2 3,4 5,4 2,3
+2 5,9 3,0 5,1 1,8
From 544dab6122d84b763854231764636131ded75fbe Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Tue, 19 May 2020 21:11:18 -0700
Subject: [PATCH 02/10] Added decimalChar to more TextLoader constructors
---
.../DataLoadSave/Text/TextLoaderSaverCatalog.cs | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
index d230f360ba..5471a2f6c9 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
@@ -21,6 +21,7 @@ public static class TextLoaderSaverCatalog
/// The catalog.
/// Array of columns defining the schema.
/// The character used as separator between data points in a row. By default the tab character is used as separator.
+ /// Decimal separator character. Default is '.'
/// Whether the file has a header with feature names. When a is provided,
/// indicates that the first line in the will be used for feature names, and that when
/// is called, the first line will be skipped. When there is no provided, just indicates that the loader should
@@ -51,6 +52,7 @@ public static class TextLoaderSaverCatalog
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
+ char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
@@ -61,6 +63,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
{
Columns = columns,
Separators = new[] { separatorChar },
+ DecimalMarker = decimalChar,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
@@ -142,6 +145,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat
/// The path to the file.
/// The columns of the schema.
/// The character used as separator between data points in a row. By default the tab character is used as separator.
+ /// Decimal separator character. Default is '.'
/// Whether the file has a header. When , the loader will skip the first line when
/// is called.
/// Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
@@ -162,6 +166,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
string path,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
+ char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
@@ -177,6 +182,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
{
Columns = columns,
Separators = new[] { separatorChar },
+ DecimalMarker = decimalChar,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
From a8a9b54b0a05708f988db5366a20c11114fe410e Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Tue, 19 May 2020 22:31:35 -0700
Subject: [PATCH 03/10] Removed decimalMarker from TextLoader constructors due
to API breaking
---
.../DataLoadSave/Text/TextLoader.cs | 4 ++--
.../DataLoadSave/Text/TextLoaderSaverCatalog.cs | 16 +++-------------
2 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 0d51a16be2..99cb5db826 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -1487,11 +1487,11 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
internal static TextLoader CreateTextLoader(IHostEnvironment host,
bool hasHeader = Defaults.HasHeader,
char separator = Defaults.Separator,
- char decimalMarker = Defaults.DecimalMarker,
bool allowQuoting = Defaults.AllowQuoting,
bool supportSparse = Defaults.AllowSparse,
bool trimWhitespace = Defaults.TrimWhitespace,
- IMultiStreamSource dataSample = null)
+ IMultiStreamSource dataSample = null,
+ char decimalMarker = Defaults.DecimalMarker)
{
Options options = new Options
{
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
index 5471a2f6c9..0fcc23fcef 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
@@ -21,7 +21,6 @@ public static class TextLoaderSaverCatalog
/// The catalog.
/// Array of columns defining the schema.
/// The character used as separator between data points in a row. By default the tab character is used as separator.
- /// Decimal separator character. Default is '.'
/// Whether the file has a header with feature names. When a is provided,
/// indicates that the first line in the will be used for feature names, and that when
/// is called, the first line will be skipped. When there is no provided, just indicates that the loader should
@@ -52,7 +51,6 @@ public static class TextLoaderSaverCatalog
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
- char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
@@ -63,7 +61,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
{
Columns = columns,
Separators = new[] { separatorChar },
- DecimalMarker = decimalChar,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
@@ -96,7 +93,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// names and their data types in the schema of the loaded data.
/// The catalog.
/// Column separator character. Default is '\t'
- /// Decimal separator character. Default is '.'
/// Whether the file has a header with feature names. When a is provided,
/// indicates that the first line in the will be used for feature names, and that when
/// is called, the first line will be skipped. When there is no provided, just indicates that the loader should
@@ -115,14 +111,13 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// columns with values 0, 0, 6, 0, and 3. The indices of the sparse columns start from 0, even though 0 represents the third column.
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
char separatorChar = TextLoader.Defaults.Separator,
- char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
- => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, decimalChar,
- allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample);
+ => TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting,
+ allowSparse, trimWhitespace, dataSample: dataSample);
///
/// Create a text loader by inferencing the dataset schema from a data model type.
@@ -145,7 +140,6 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat
/// The path to the file.
/// The columns of the schema.
/// The character used as separator between data points in a row. By default the tab character is used as separator.
- /// Decimal separator character. Default is '.'
/// Whether the file has a header. When , the loader will skip the first line when
/// is called.
/// Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
@@ -166,7 +160,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
string path,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
- char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
@@ -182,7 +175,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
{
Columns = columns,
Separators = new[] { separatorChar },
- DecimalMarker = decimalChar,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
@@ -229,7 +221,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str
/// The catalog.
/// The path to the file.
/// Column separator character. Default is '\t'
- /// Decimal separator character. Default is '.'
/// Whether the file has a header. When , the loader will skip the first line when
/// is called.
/// Whether the input may include double-quoted values. This parameter is used to distinguish separator characters
@@ -249,7 +240,6 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str
public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog,
string path,
char separatorChar = TextLoader.Defaults.Separator,
- char decimalChar = TextLoader.Defaults.DecimalMarker,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
@@ -264,7 +254,7 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
return TextLoader.CreateTextLoader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar,
- decimalChar, allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path));
+ allowQuoting, allowSparse, trimWhitespace).Load(new MultiFileSource(path));
}
///
From 7658a70887bdda3c59bef9f5ae30b27175c00b9b Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Wed, 20 May 2020 14:08:28 -0700
Subject: [PATCH 04/10] Added unit test for ',' as a decimal marker, and added
decimalMarker to TextLoaderCursor and TextLoaderParser
---
.../Utilities/DoubleParser.cs | 12 ++++---
.../DataLoadSave/Text/TextLoader.cs | 13 ++++---
.../DataLoadSave/Text/TextLoaderCursor.cs | 12 ++++---
.../DataLoadSave/Text/TextLoaderParser.cs | 2 ++
test/Microsoft.ML.Tests/TextLoaderTests.cs | 35 +++++++++++++++++++
5 files changed, 59 insertions(+), 15 deletions(-)
diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
index bad57cdd3c..18e2c0fd46 100644
--- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
+++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
@@ -527,6 +527,8 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
Contracts.Assert(num == 0);
Contracts.Assert(exp == 0);
+ const char decimalMarker = '.';
+
if (ich >= span.Length)
return false;
@@ -554,7 +556,7 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
return false;
break;
- case '.':
+ case decimalMarker:
goto LPoint;
// The common cases.
@@ -571,7 +573,7 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
break;
}
- // Get digits before '.'
+ // Get digits before the decimal marker, which may be '.' or ','
uint d;
for (; ; )
{
@@ -593,14 +595,14 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
}
Contracts.Assert(i < span.Length);
- if (span[i] != '.')
+ if (span[i] != decimalMarker)
goto LAfterDigits;
LPoint:
Contracts.Assert(i < span.Length);
- Contracts.Assert(span[i] == '.');
+ Contracts.Assert(span[i] == decimalMarker);
- // Get the digits after '.'
+ // Get the digits after the decimal marker, which may be '.' or ','
for (; ; )
{
if (++i >= span.Length)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 99cb5db826..0d0b2c179c 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -709,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List> lines = null;
if (headerFile != null)
- Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
if (needInputSize && inputSize == 0)
- Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
else if (headerFile == null && parent.HasHeader)
- Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
if (needInputSize && inputSize == 0)
{
@@ -1410,8 +1410,11 @@ private TextLoader(IHost host, ModelLoadContext ctx)
if (_separators.Contains(':'))
host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0);
- _decimalMarker = ctx.Reader.ReadChar();
- host.CheckDecode(_decimalMarker == '.' || _decimalMarker == ',');
+ if (ctx.Header.ModelVerWritten >= 0x0001000D)
+ {
+ _decimalMarker = ctx.Reader.ReadChar();
+ host.CheckDecode(_decimalMarker == '.' || _decimalMarker == ',');
+ }
_bindings = new Bindings(ctx, this);
_parser = new Parser(this);
}
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
index 62f5709169..f2ff6ec4a1 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1, parent._decimalMarker);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
@@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd, parent._decimalMarker);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter()
};
}
- public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines)
+ public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines, char decimalMarker)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
@@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
count = 2;
LineBatch batch;
- var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
+ var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1, decimalMarker);
try
{
batch = reader.GetBatch();
@@ -404,6 +404,7 @@ private sealed class LineReader
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly char[] _separators;
+ private readonly char _decimalMarker;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;
@@ -413,7 +414,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;
- public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
+ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref, char decimalMarker)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
@@ -430,6 +431,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_separators = separators;
_files = files;
_cref = cref;
+ _decimalMarker = decimalMarker;
_queue = new BlockingQueue(bufSize);
_thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc);
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
index 13019c4bf2..c84cb1927c 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -633,6 +633,7 @@ public void Clear()
}
private readonly char[] _separators;
+ private readonly char _decimalMarker;
private readonly OptionFlags _flags;
private readonly int _inputSize;
private readonly ColInfo[] _infos;
@@ -683,6 +684,7 @@ public Parser(TextLoader parent)
}
_separators = parent._separators;
+ _decimalMarker = parent._decimalMarker;
_flags = parent._flags;
_inputSize = parent._inputSize;
Contracts.Assert(_inputSize >= 0);
diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs
index b2421bacce..e249808fda 100644
--- a/test/Microsoft.ML.Tests/TextLoaderTests.cs
+++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs
@@ -803,6 +803,41 @@ public void TestTextLoaderKeyTypeBackCompat()
}
}
+ [Fact]
+ public void TestCommaAsDecimalMarker()
+ {
+ string dataPath = GetDataPath("iris_decimal_marker_as_comma.txt");
+
+ // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+ // as a catalog of available operations and as the source of randomness.
+ var mlContext = new MLContext(seed: 1);
+ var reader = new TextLoader(mlContext, new TextLoader.Options()
+ {
+ Columns = new[]
+ {
+ new TextLoader.Column("Label", DataKind.Single, 0),
+ new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ },
+ DecimalMarker = ','
+ });
+ // Data
+ var textData = reader.Load(GetDataPath(dataPath));
+ var data = mlContext.Data.Cache(mlContext.Transforms.Conversion.MapValueToKey("Label")
+ .Fit(textData).Transform(textData));
+
+ // Pipeline
+ var pipeline = mlContext.MulticlassClassification.Trainers.OneVersusAll(
+ mlContext.BinaryClassification.Trainers.LinearSvm(new Trainers.LinearSvmTrainer.Options { NumberOfIterations = 100 }),
+ useProbabilities: false);
+
+ var model = pipeline.Fit(data);
+ var predictions = model.Transform(data);
+
+ // Metrics
+ var metrics = mlContext.MulticlassClassification.Evaluate(predictions);
+ Assert.True(metrics.MicroAccuracy > 0.83);
+ }
+
private class IrisNoFields
{
}
From ece551852a2804a9a70e5ddd1a2bc4b6b6139577 Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Wed, 20 May 2020 17:33:15 -0700
Subject: [PATCH 05/10] Added DecimalMarker in DoubleParser
---
src/Microsoft.ML.Core/Utilities/DoubleParser.cs | 16 +++++++++++-----
.../DataLoadSave/Text/TextLoader.cs | 10 ++++++----
.../DataLoadSave/Text/TextLoaderCursor.cs | 12 +++++-------
.../DataLoadSave/Text/TextLoaderParser.cs | 2 --
4 files changed, 22 insertions(+), 18 deletions(-)
diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
index 18e2c0fd46..1740bd3daa 100644
--- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
+++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
@@ -17,6 +17,13 @@ internal static class DoubleParser
private const ulong TopThreeBits = 0xE000000000000000UL;
private const char InfinitySymbol = '\u221E';
+ // The decimal marker that separates the integer part from the fractional part of a number
+ // written in decimal from can vary across different cultures as either '.' or ','. The
+ // default decimal marker in ML .NET is '.', however through this static char variable,
+ // we allow users to specify the decimal marker used in their datasets as ',' as well.
+ [BestFriend]
+ internal static char DecimalMarker = '.';
+
// REVIEW: casting ulong to Double doesn't always do the right thing, for example
// with 0x84595161401484A0UL. Hence the gymnastics several places in this code. Note that
// long to Double does work. The work around is:
@@ -527,8 +534,6 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
Contracts.Assert(num == 0);
Contracts.Assert(exp == 0);
- const char decimalMarker = '.';
-
if (ich >= span.Length)
return false;
@@ -556,7 +561,8 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
return false;
break;
- case decimalMarker:
+ case '.':
+ case ',':
goto LPoint;
// The common cases.
@@ -595,12 +601,12 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
}
Contracts.Assert(i < span.Length);
- if (span[i] != decimalMarker)
+ if (span[i] != DecimalMarker)
goto LAfterDigits;
LPoint:
Contracts.Assert(i < span.Length);
- Contracts.Assert(span[i] == decimalMarker);
+ Contracts.Assert(span[i] == DecimalMarker);
// Get the digits after the decimal marker, which may be '.' or ','
for (; ; )
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 0d0b2c179c..b69038a4d7 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -709,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List> lines = null;
if (headerFile != null)
- Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
+ Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
if (needInputSize && inputSize == 0)
- Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
+ Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
else if (headerFile == null && parent.HasHeader)
- Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines, parent._decimalMarker);
+ Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
if (needInputSize && inputSize == 0)
{
@@ -1219,7 +1219,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}
- if (options.DecimalMarker == ',' && _separators.Contains(','))
+ if (_separators.Contains(options.DecimalMarker))
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker);
_decimalMarker = options.DecimalMarker;
_bindings = new Bindings(this, cols, headerFile, dataSample);
@@ -1607,6 +1607,7 @@ public BoundLoader(TextLoader loader, IMultiStreamSource files)
public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null)
{
_host.CheckValueOrNull(rand);
+ DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.Create(_loader, _files, active);
}
@@ -1614,6 +1615,7 @@ public DataViewRowCursor GetRowCursor(IEnumerable columns
public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null)
{
_host.CheckValueOrNull(rand);
+ DoubleParser.DecimalMarker = _loader._decimalMarker;
var active = Utils.BuildArray(_loader._bindings.OutputSchema.Count, columnsNeeded);
return Cursor.CreateSet(_loader, _files, active, n);
}
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
index f2ff6ec4a1..62f5709169 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1, parent._decimalMarker);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
@@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd, parent._decimalMarker);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter()
};
}
- public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines, char decimalMarker)
+ public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
@@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
count = 2;
LineBatch batch;
- var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1, decimalMarker);
+ var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
try
{
batch = reader.GetBatch();
@@ -404,7 +404,6 @@ private sealed class LineReader
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly char[] _separators;
- private readonly char _decimalMarker;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;
@@ -414,7 +413,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;
- public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref, char decimalMarker)
+ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
@@ -431,7 +430,6 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_separators = separators;
_files = files;
_cref = cref;
- _decimalMarker = decimalMarker;
_queue = new BlockingQueue(bufSize);
_thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc);
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
index c84cb1927c..13019c4bf2 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -633,7 +633,6 @@ public void Clear()
}
private readonly char[] _separators;
- private readonly char _decimalMarker;
private readonly OptionFlags _flags;
private readonly int _inputSize;
private readonly ColInfo[] _infos;
@@ -684,7 +683,6 @@ public Parser(TextLoader parent)
}
_separators = parent._separators;
- _decimalMarker = parent._decimalMarker;
_flags = parent._flags;
_inputSize = parent._inputSize;
Contracts.Assert(_inputSize >= 0);
From a663f210db833e1d48654c86ce0ff753230d9447 Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Wed, 20 May 2020 17:48:33 -0700
Subject: [PATCH 06/10] Added decimal marker check and removed decimalMarker
from CreateTextLoader's constructor
---
src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index b69038a4d7..9b5c50184e 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -1219,6 +1219,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}
+ if (options.DecimalMarker != '.' && options.DecimalMarker != ',')
+ throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker);
if (_separators.Contains(options.DecimalMarker))
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker);
_decimalMarker = options.DecimalMarker;
@@ -1493,14 +1495,12 @@ internal static TextLoader CreateTextLoader(IHostEnvironment host,
bool allowQuoting = Defaults.AllowQuoting,
bool supportSparse = Defaults.AllowSparse,
bool trimWhitespace = Defaults.TrimWhitespace,
- IMultiStreamSource dataSample = null,
- char decimalMarker = Defaults.DecimalMarker)
+ IMultiStreamSource dataSample = null)
{
Options options = new Options
{
HasHeader = hasHeader,
Separators = new[] { separator },
- DecimalMarker = decimalMarker,
AllowQuoting = allowQuoting,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace
From 141fa7be65febd22e87b97723c2ba9e447972648 Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Thu, 21 May 2020 18:30:13 -0700
Subject: [PATCH 07/10] Added TextLoader decimalMarker unit tests, and refined
logic in DoubleParser
---
.../Utilities/DoubleParser.cs | 13 +
.../DataLoadSave/Text/TextLoader.cs | 4 +-
test/Microsoft.ML.Tests/TextLoaderTests.cs | 340 ++++++++++++++++--
test/data/iris-decimal-marker-as-comma.csv | 151 ++++++++
...a.txt => iris-decimal-marker-as-comma.txt} | 0
5 files changed, 484 insertions(+), 24 deletions(-)
create mode 100644 test/data/iris-decimal-marker-as-comma.csv
rename test/data/{iris_decimal_marker_as_comma.txt => iris-decimal-marker-as-comma.txt} (100%)
diff --git a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
index 1740bd3daa..22bd8ea82e 100644
--- a/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
+++ b/src/Microsoft.ML.Core/Utilities/DoubleParser.cs
@@ -17,6 +17,14 @@ internal static class DoubleParser
private const ulong TopThreeBits = 0xE000000000000000UL;
private const char InfinitySymbol = '\u221E';
+ // Note for future development: DecimalMarker is a static field, so a single value is
+ // shared by every caller of DoubleParser in the process. TextLoader overwrites it each
+ // time a row cursor is created, which means datasets that use different decimal markers
+ // cannot be read reliably at the same time: whichever loader created a cursor most
+ // recently decides how all of them parse numbers. Loading such datasets together is
+ // unlikely, but the marker should eventually become per-parser state rather than a
+ // process-wide setting.
+
// The decimal marker that separates the integer part from the fractional part of a number
// written in decimal from can vary across different cultures as either '.' or ','. The
// default decimal marker in ML .NET is '.', however through this static char variable,
@@ -562,7 +570,12 @@ private static bool TryParseCore(ReadOnlySpan span, ref int ich, ref bool
break;
case '.':
+ if (DecimalMarker != '.') // '.' is not the configured decimal marker, so encountering it here is a parse error.
+ return false; // Report the failure; the caller then sets the out value to NaN.
+ goto LPoint;
case ',':
+ if (DecimalMarker != ',') // Same logic as above.
+ return false;
goto LPoint;
// The common cases.
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 9b5c50184e..83438290c9 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -1071,7 +1071,7 @@ private static VersionInfo GetVersionInfo()
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
//verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
- verWrittenCur: 0x0001000D, // Added decimal marker option to allow for ',' to be a decimal marker
+ verWrittenCur: 0x0001000D, // Added escapeChar option and decimal marker option to allow for ',' to be a decimal marker
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
@@ -1221,8 +1221,6 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
if (options.DecimalMarker != '.' && options.DecimalMarker != ',')
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker);
- if (_separators.Contains(options.DecimalMarker))
- throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker and separator cannot be the same '{0}' character.", options.DecimalMarker);
_decimalMarker = options.DecimalMarker;
_bindings = new Bindings(this, cols, headerFile, dataSample);
_parser = new Parser(this);
diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs
index e249808fda..201d8ae678 100644
--- a/test/Microsoft.ML.Tests/TextLoaderTests.cs
+++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs
@@ -804,38 +804,336 @@ public void TestTextLoaderKeyTypeBackCompat()
}
[Fact]
- public void TestCommaAsDecimalMarker()
+ public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
{
- string dataPath = GetDataPath("iris_decimal_marker_as_comma.txt");
+ // Checks backward compatibility with a text loader created with "verWrittenCur: 0x0001000C"
+ // Model generated with:
+ // loader=text{header+ col=SepalLength:Num:0 col=SepalWidth:Num:1 col=PetalLength:Num:2 col=PetalWidth:Num:2 col=Cat:TX:1-8 col=Num:9-14 col=Type:TX:4}
+ var mlContext = new MLContext(1);
+ string textLoaderModelPath = GetDataPath("backcompat/textloader_VerWritt_0x0001000C.zip");
+ string irisPath = GetDataPath(TestDatasets.irisData.trainFilename);
+
+ IDataView iris;
+ using (FileStream modelfs = File.OpenRead(textLoaderModelPath))
+ using (var rep = RepositoryReader.Open(modelfs, mlContext))
+ {
+ iris = ModelFileUtils.LoadLoader(mlContext, rep, new MultiFileSource(irisPath), false);
+ }
+
+ var previewIris = iris.Preview(1);
+ var irisFirstRow = new Dictionary();
+ irisFirstRow["SepalLength"] = 5.1f;
+ irisFirstRow["SepalWidth"] = 3.5f;
+ irisFirstRow["PetalLength"] = 1.4f;
+ irisFirstRow["PetalWidth"] = 0.2f;
+
+ Assert.Equal(5, previewIris.ColumnView.Length);
+ Assert.Equal("SepalLength", previewIris.Schema[0].Name);
+ Assert.Equal(NumberDataViewType.Single, previewIris.Schema[0].Type);
+ int index = 0;
+ foreach (var entry in irisFirstRow)
+ {
+ Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key);
+ Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value);
+ }
+ Assert.Equal("Type", previewIris.RowView[0].Values[index].Key);
+ Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());
+ }
+
+        [Fact]
+        public void TestCommaAsDecimalMarkerFloat()
+        {
+            // iris.txt and iris-decimal-marker-as-comma.txt hold the same data and differ only in
+            // their decimal marker ('.' and ',' respectively). Loading each with the matching
+            // DecimalMarker must yield identical labels and float (Single) feature vectors.
+            var mlContext = new MLContext(seed: 1);
+
+            UInt32[] labels = new uint[150];
+            float[][] features = new float[150][];
+
+            // Read dataset with period as decimal marker.
+            string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
+            var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = '.'
+            });
+            var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(dataPathDecimalMarkerPeriod);
+
+            // Load values from iris.txt
+            DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
+            using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
+            UInt32 labelPeriod = default;
+            ValueGetter<UInt32> labelDelegatePeriod = cursorPeriod.GetGetter<UInt32>(columnsPeriod[0]);
+            VBuffer<float> featuresPeriod = default;
+            ValueGetter<VBuffer<float>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<float>>(columnsPeriod[1]);
+
+            // Save every row's label and features so the comma-marker dataset can be compared against them.
+            int rowsPeriod = 0;
+            while (cursorPeriod.MoveNext())
+            {
+                labelDelegatePeriod(ref labelPeriod);
+                featuresDelegatePeriod(ref featuresPeriod);
+                labels[rowsPeriod] = labelPeriod;
+                features[rowsPeriod] = featuresPeriod.GetValues().ToArray();
+                rowsPeriod++;
+            }
+            Assert.True(rowsPeriod > 0); // Guard against an empty reference dataset making the test vacuous.
+
+            // Read dataset with comma as decimal marker.
+            string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+            var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = ','
+            });
+            var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(dataPathDecimalMarkerComma);
+
+            // Load values from iris-decimal-marker-as-comma.txt
+            DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
+            using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
+            UInt32 labelComma = default;
+            ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]);
+            VBuffer<float> featuresComma = default;
+            ValueGetter<VBuffer<float>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<float>>(columnsComma[1]);
+
+            // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt
+            int rowsComma = 0;
+            while (cursorComma.MoveNext())
+            {
+                labelDelegateComma(ref labelComma);
+                featuresDelegateComma(ref featuresComma);
+                Assert.Equal(labels[rowsComma], labelComma);
+                Assert.Equal(features[rowsComma], featuresComma.GetValues().ToArray());
+                rowsComma++;
+            }
+            Assert.Equal(rowsPeriod, rowsComma); // Both datasets must contain the same number of rows.
+        }
+
+        [Fact]
+        public void TestCommaAsDecimalMarkerDouble()
+        {
+            // iris.txt and iris-decimal-marker-as-comma.txt hold the same data and differ only in
+            // their decimal marker ('.' and ',' respectively). Here the features are loaded as
+            // DataKind.Double, so the test covers the double-precision parsing path.
+            var mlContext = new MLContext(seed: 1);
+
+            // Read dataset with period as decimal marker.
+            string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
+            var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = '.'
+            });
+            var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(dataPathDecimalMarkerPeriod);
+
+            // Load values from iris.txt
+            DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
+            using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
+            UInt32 labelPeriod = default;
+            ValueGetter<UInt32> labelDelegatePeriod = cursorPeriod.GetGetter<UInt32>(columnsPeriod[0]);
+            VBuffer<double> featuresPeriod = default;
+            ValueGetter<VBuffer<double>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<double>>(columnsPeriod[1]);
+
+            UInt32[] labels = new uint[150];
+            double[][] features = new double[150][];
+
+            // Save every row's label and features so the comma-marker dataset can be compared against them.
+            int rowsPeriod = 0;
+            while (cursorPeriod.MoveNext())
+            {
+                labelDelegatePeriod(ref labelPeriod);
+                featuresDelegatePeriod(ref featuresPeriod);
+                labels[rowsPeriod] = labelPeriod;
+                features[rowsPeriod] = featuresPeriod.GetValues().ToArray();
+                rowsPeriod++;
+            }
+            Assert.True(rowsPeriod > 0); // Guard against an empty reference dataset making the test vacuous.
+
+            // Read dataset with comma as decimal marker.
+            string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+            var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = ','
+            });
+            var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(dataPathDecimalMarkerComma);
+
+            // Load values from iris-decimal-marker-as-comma.txt
+            DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
+            using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
+            UInt32 labelComma = default;
+            ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]);
+            VBuffer<double> featuresComma = default;
+            ValueGetter<VBuffer<double>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<double>>(columnsComma[1]);
+
+            // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt
+            int rowsComma = 0;
+            while (cursorComma.MoveNext())
+            {
+                labelDelegateComma(ref labelComma);
+                featuresDelegateComma(ref featuresComma);
+                Assert.Equal(labels[rowsComma], labelComma);
+                Assert.Equal(features[rowsComma], featuresComma.GetValues().ToArray());
+                rowsComma++;
+            }
+            Assert.Equal(rowsPeriod, rowsComma); // Both datasets must contain the same number of rows.
+        }
+
+        [Fact]
+        public void TestWrongDecimalMarkerInputs()
+        {
+            // When DecimalMarker does not match the decimal marker actually used in the dataset,
+            // the fractional values cannot be parsed and are loaded as NaN. Verify both directions.
+            var mlContext = new MLContext(seed: 1);
+
+            // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ','.
+            string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
+            var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = ','
+            });
+            var textDataMismatched1 = readerDecimalMarkerComma.Load(dataPathDecimalMarkerPeriod);
+
+            // Check that the features being loaded are NaN.
+            DataViewSchema columnsPeriod = textDataMismatched1.Schema;
+            using DataViewRowCursor cursorPeriod = textDataMismatched1.GetRowCursor(columnsPeriod);
+            VBuffer<float> featuresPeriod = default;
+            ValueGetter<VBuffer<float>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<float>>(columnsPeriod[1]);
+
+            // Every feature of every row must be NaN (xUnit convention: expected value first).
+            while (cursorPeriod.MoveNext())
+            {
+                featuresDelegatePeriod.Invoke(ref featuresPeriod);
+                foreach (float feature in featuresPeriod.GetValues())
+                    Assert.Equal(Single.NaN, feature);
+            }
+
+            // Try reading a dataset where ',' is the actual decimal marker, but DecimalMarker = '.'.
+            string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+            var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
+            {
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.UInt32, 0),
+                    new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+                },
+                DecimalMarker = '.'
+            });
+            var textDataMismatched2 = readerDecimalMarkerPeriod.Load(dataPathDecimalMarkerComma);
+
+            DataViewSchema columnsComma = textDataMismatched2.Schema;
+            using DataViewRowCursor cursorComma = textDataMismatched2.GetRowCursor(columnsComma);
+            VBuffer<float> featuresComma = default;
+            ValueGetter<VBuffer<float>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<float>>(columnsComma[1]);
+
+            // Every feature of every row must be NaN (xUnit convention: expected value first).
+            while (cursorComma.MoveNext())
+            {
+                featuresDelegateComma.Invoke(ref featuresComma);
+                foreach (float feature in featuresComma.GetValues())
+                    Assert.Equal(Single.NaN, feature);
+            }
+        }
+
+ [Fact]
+ public void TestCommaAsDecimalMarkerWithSeperatorAsCommaInCSV()
+ {
+ // Check that TextLoader can read a CSV file in which ',' is both the column separator and the
+ // decimal marker, provided that every decimal value is enclosed in quotes.
+ string dataPathCsv = GetDataPath("iris-decimal-marker-as-comma.csv");
- // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
- // as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext(seed: 1);
- var reader = new TextLoader(mlContext, new TextLoader.Options()
+ var readerCsv = new TextLoader(mlContext, new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Label", DataKind.Single, 0),
new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
},
+ DecimalMarker = ',',
+ Separator = ",",
+ AllowQuoting = true,
+ HasHeader = true
+ });
+ var textDataCsv = readerCsv.Load(GetDataPath(dataPathCsv));
+
+ // Load values from iris-decimal-marker-as-comma.csv
+ DataViewSchema columnsCsv = textDataCsv.Schema;
+ using DataViewRowCursor cursorCsv = textDataCsv.GetRowCursor(columnsCsv);
+ UInt32 labelCsv = default;
+ ValueGetter labelDelegatePeriod = cursorCsv.GetGetter(columnsCsv[0]);
+ VBuffer featuresCsv = default;
+ ValueGetter> featuresDelegatePeriod = cursorCsv.GetGetter>(columnsCsv[1]);
+
+ UInt32[] labels = new uint[150];
+ double[][] features = new double[150][];
+
+ // Iterate over each row and save labels and features to array for future comparison
+ int count = 0;
+ while (cursorCsv.MoveNext())
+ {
+ //Get values from respective columns
+ labelDelegatePeriod(ref labelCsv);
+ featuresDelegatePeriod(ref featuresCsv);
+ labels[count] = labelCsv;
+ features[count] = featuresCsv.GetValues().ToArray();
+ count++;
+ }
+
+ // Read dataset with comma as decimal marker.
+ string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+ var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+ {
+ Columns = new[]
+ {
+ new TextLoader.Column("Label", DataKind.UInt32, 0),
+ new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ },
DecimalMarker = ','
});
- // Data
- var textData = reader.Load(GetDataPath(dataPath));
- var data = mlContext.Data.Cache(mlContext.Transforms.Conversion.MapValueToKey("Label")
- .Fit(textData).Transform(textData));
-
- // Pipeline
- var pipeline = mlContext.MulticlassClassification.Trainers.OneVersusAll(
- mlContext.BinaryClassification.Trainers.LinearSvm(new Trainers.LinearSvmTrainer.Options { NumberOfIterations = 100 }),
- useProbabilities: false);
-
- var model = pipeline.Fit(data);
- var predictions = model.Transform(data);
-
- // Metrics
- var metrics = mlContext.MulticlassClassification.Evaluate(predictions);
- Assert.True(metrics.MicroAccuracy > 0.83);
+ var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
+
+ // Load values from iris-decimal-marker-as-comma.txt
+ DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
+ using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
+ UInt32 labelComma = default;
+ ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]);
+ VBuffer featuresComma = default;
+ ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]);
+
+ // Check values from iris-decimal-marker-as-comma.txt match those in iris-decimal-marker-as-comma.csv
+ count = 0;
+ while (cursorComma.MoveNext())
+ {
+ //Get values from respective columns
+ labelDelegateComma(ref labelComma);
+ featuresDelegateComma(ref featuresComma);
+ Assert.Equal(labels[count], labelComma);
+ Assert.Equal(features[count], featuresComma.GetValues().ToArray());
+ count++;
+ }
}
private class IrisNoFields
diff --git a/test/data/iris-decimal-marker-as-comma.csv b/test/data/iris-decimal-marker-as-comma.csv
new file mode 100644
index 0000000000..0612dd281e
--- /dev/null
+++ b/test/data/iris-decimal-marker-as-comma.csv
@@ -0,0 +1,151 @@
+Label,Sepal,length,Sepal,width,Petal length,Petal width
+0,"5,1","3,5","1,4","0.2"
+0,"4,9","3,0","1,4","0.2"
+0,"4,7","3,2","1,3","0.2"
+0,"4,6","3,1","1,5","0.2"
+0,"5,0","3,6","1,4","0.2"
+0,"5,4","3,9","1,7","0.4"
+0,"4,6","3,4","1,4","0.3"
+0,"5,0","3,4","1,5","0.2"
+0,"4,4","2,9","1,4","0.2"
+0,"4,9","3,1","1,5","0.1"
+0,"5,4","3,7","1,5","0.2"
+0,"4,8","3,4","1,6","0.2"
+0,"4,8","3,0","1,4","0.1"
+0,"4,3","3,0","1,1","0.1"
+0,"5,8","4,0","1,2","0.2"
+0,"5,7","4,4","1,5","0.4"
+0,"5,4","3,9","1,3","0.4"
+0,"5,1","3,5","1,4","0.3"
+0,"5,7","3,8","1,7","0.3"
+0,"5,1","3,8","1,5","0.3"
+0,"5,4","3,4","1,7","0.2"
+0,"5,1","3,7","1,5","0.4"
+0,"4,6","3,6","1,0","0.2"
+0,"5,1","3,3","1,7","0,5"
+0,"4,8","3,4","1,9","0.2"
+0,"5,0","3,0","1,6","0.2"
+0,"5,0","3,4","1,6","0.4"
+0,"5,2","3,5","1,5","0.2"
+0,"5,2","3,4","1,4","0.2"
+0,"4,7","3,2","1,6","0.2"
+0,"4,8","3,1","1,6","0.2"
+0,"5,4","3,4","1,5","0.4"
+0,"5,2","4,1","1,5","0.1"
+0,"5,5","4,2","1,4","0.2"
+0,"4,9","3,1","1,5","0.1"
+0,"5,0","3,2","1,2","0.2"
+0,"5,5","3,5","1,3","0.2"
+0,"4,9","3,1","1,5","0.1"
+0,"4,4","3,0","1,3","0.2"
+0,"5,1","3,4","1,5","0.2"
+0,"5,0","3,5","1,3","0.3"
+0,"4,5","2,3","1,3","0.3"
+0,"4,4","3,2","1,3","0.2"
+0,"5,0","3,5","1,6","0.6"
+0,"5,1","3,8","1,9","0.4"
+0,"4,8","3,0","1,4","0.3"
+0,"5,1","3,8","1,6","0.2"
+0,"4,6","3,2","1,4","0.2"
+0,"5,3","3,7","1,5","0.2"
+0,"5,0","3,3","1,4","0.2"
+0,"7,0","3,2","4,7","1,4
+0,"6,4","3,2","4,5","1,5
+0,"6,9","3,1","4,9","1,5
+0,"5,5","2,3","4,0","1,3
+0,"6,5","2,8","4,6","1,5
+0,"5,7","2,8","4,5","1,3
+0,"6,3","3,3","4,7","1,6
+0,"4,9","2,4","3,3","1,0
+0,"6,6","2,9","4,6","1,3
+0,"5,2","2,7","3,9","1,4
+0,"5,0","2,0","3,5","1,0
+0,"5,9","3,0","4,2","1,5
+0,"6,0","2,2","4,0","1,0
+0,"6,1","2,9","4,7","1,4
+0,"5,6","2,9","3,6","1,3
+0,"6,7","3,1","4,4","1,4
+0,"5,6","3,0","4,5","1,5
+0,"5,8","2,7","4,1","1,0
+0,"6,2","2,2","4,5","1,5
+0,"5,6","2,5","3,9","1,1
+0,"5,9","3,2","4,8","1,8
+0,"6,1","2,8","4,0","1,3
+0,"6,3","2,5","4,9","1,5
+0,"6,1","2,8","4,7","1,2
+0,"6,4","2,9","4,3","1,3
+0,"6,6","3,0","4,4","1,4
+0,"6,8","2,8","4,8","1,4
+0,"6,7","3,0","5,0","1,7
+0,"6,0","2,9","4,5","1,5
+0,"5,7","2,6","3,5","1,0
+0,"5,5","2,4","3,8","1,1
+0,"5,5","2,4","3,7","1,0
+0,"5,8","2,7","3,9","1,2
+0,"6,0","2,7","5,1","1,6
+0,"5,4","3,0","4,5","1,5
+0,"6,0","3,4","4,5","1,6
+0,"6,7","3,1","4,7","1,5
+0,"6,3","2,3","4,4","1,3
+0,"5,6","3,0","4,1","1,3
+0,"5,5","2,5","4,0","1,3
+0,"5,5","2,6","4,4","1,2
+0,"6,1","3,0","4,6","1,4
+0,"5,8","2,6","4,0","1,2
+0,"5,0","2,3","3,3","1,0
+0,"5,6","2,7","4,2","1,3
+0,"5,7","3,0","4,2","1,2
+0,"5,7","2,9","4,2","1,3
+0,"6,2","2,9","4,3","1,3
+0,"5,1","2,5","3,0","1,1
+0,"5,7","2,8","4,1","1,3
+0,"6,3","3,3","6,0","2,5
+0,"5,8","2,7","5,1","1,9
+0,"7,1","3,0","5,9","2,1
+0,"6,3","2,9","5,6","1,8
+0,"6,5","3,0","5,8","2,2
+0,"7,6","3,0","6,6","2,1
+0,"4,9","2,5","4,5","1,7
+0,"7,3","2,9","6,3","1,8
+0,"6,7","2,5","5,8","1,8
+0,"7,2","3,6","6,1","2,5
+0,"6,5","3,2","5,1","2,0
+0,"6,4","2,7","5,3","1,9
+0,"6,8","3,0","5,5","2,1
+0,"5,7","2,5","5,0","2,0
+0,"5,8","2,8","5,1","2,4
+0,"6,4","3,2","5,3","2,3
+0,"6,5","3,0","5,5","1,8
+0,"7,7","3,8","6,7","2,2
+0,"7,7","2,6","6,9","2,3
+0,"6,0","2,2","5,0","1,5
+0,"6,9","3,2","5,7","2,3
+0,"5,6","2,8","4,9","2,0
+0,"7,7","2,8","6,7","2,0
+0,"6,3","2,7","4,9","1,8
+0,"6,7","3,3","5,7","2,1
+0,"7,2","3,2","6,0","1,8
+0,"6,2","2,8","4,8","1,8
+0,"6,1","3,0","4,9","1,8
+0,"6,4","2,8","5,6","2,1
+0,"7,2","3,0","5,8","1,6
+0,"7,4","2,8","6,1","1,9
+0,"7,9","3,8","6,4","2,0
+0,"6,4","2,8","5,6","2,2
+0,"6,3","2,8","5,1","1,5
+0,"6,1","2,6","5,6","1,4
+0,"7,7","3,0","6,1","2,3
+0,"6,3","3,4","5,6","2,4
+0,"6,4","3,1","5,5","1,8
+0,"6,0","3,0","4,8","1,8
+0,"6,9","3,1","5,4","2,1
+0,"6,7","3,1","5,6","2,4
+0,"6,9","3,1","5,1","2,3
+0,"5,8","2,7","5,1","1,9
+0,"6,8","3,2","5,9","2,3
+0,"6,7","3,3","5,7","2,5
+0,"6,7","3,0","5,2","2,3
+0,"6,3","2,5","5,0","1,9
+0,"6,5","3,0","5,2","2,0
+0,"6,2","3,4","5,4","2,3
+0,"5,9","3,0","5,1","1,8
diff --git a/test/data/iris_decimal_marker_as_comma.txt b/test/data/iris-decimal-marker-as-comma.txt
similarity index 100%
rename from test/data/iris_decimal_marker_as_comma.txt
rename to test/data/iris-decimal-marker-as-comma.txt
From 142f1305065f02dab08f3e4c580719a6e3b72897 Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Thu, 21 May 2020 20:37:02 -0700
Subject: [PATCH 08/10] Refine tests, logic, csv dataset
---
.../DataLoadSave/Text/TextLoader.cs | 5 +-
test/Microsoft.ML.Tests/TextLoaderTests.cs | 259 ++++++---------
test/data/iris-decimal-marker-as-comma.csv | 298 +++++++++---------
3 files changed, 247 insertions(+), 315 deletions(-)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index b08635ad59..2c64b98a14 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -475,7 +475,7 @@ public class Options
public char[] Separators = new[] { Defaults.Separator };
///
- /// The character that should be used as the decimal marker.
+ /// The character that should be used as the decimal marker. Default value is '.'. Only '.' and ',' are allowed to be decimal markers.
///
[Argument(ArgumentType.AtMostOnce, Name = "Decimal Marker", HelpText = "Character symbol used to separate the integer part from the fractional part of a number written in decimal form.", ShortName = "decimal")]
public char DecimalMarker = Defaults.DecimalMarker;
@@ -1229,6 +1229,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
if (options.DecimalMarker != '.' && options.DecimalMarker != ',')
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker);
+ if (!options.AllowQuoting && options.DecimalMarker == ',' && options.Separator == ",")
+ throw _host.ExceptUserArg(nameof(Options.AllowQuoting), "Quoting must be allowed if decimal marker and separator are the ',' character.");
_decimalMarker = options.DecimalMarker;
_escapeChar = options.EscapeChar;
if(_separators.Contains(_escapeChar))
@@ -1432,6 +1434,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
else
{
_escapeChar = Defaults.EscapeChar;
+ _decimalMarker = Defaults.DecimalMarker;
}
host.CheckDecode(!_separators.Contains(_escapeChar));
diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs
index bd0096c82c..9e31ff20f3 100644
--- a/test/Microsoft.ML.Tests/TextLoaderTests.cs
+++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs
@@ -840,80 +840,101 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());
}
- [Fact]
- public void TestCommaAsDecimalMarkerFloat()
+ [Theory]
+ [InlineData(true)]
+ [InlineData(false)]
+ public void TestCommaAsDecimalMarkerFloat(bool useCsvVersion)
{
+ // When userCsvVersion == false:
// Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their
// decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt.
// Here, the features are of type float (Single), and the test checks for decimal markers with floats.
- var mlContext = new MLContext(seed: 1);
- UInt32[] labels = new uint[150];
- float[][] features = new float[150][];
+ // When userCsvVersion == true:
+ // Check to confirm TextLoader can read data from a CSV file where the separator is ',', decimals
+ // enclosed with quotes, and with the decimal marker being ','. Features are of type float (Single),
+ // and the test checks for decimal markers with floats.
+ var mlContext = new MLContext(seed: 1);
- // Read dataset with period as decimal marker.
- string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
- var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
+ string dataPathDecimalMarkerComma;
+ TextLoader.Options options = new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) })
},
- DecimalMarker = '.'
- });
- var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod));
+ };
+ if (useCsvVersion)
+ {
+ dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv");
+ options.DecimalMarker = ',';
+ options.Separator = ",";
+ options.AllowQuoting = true;
+ options.HasHeader = true;
+ }
+ else
+ {
+ dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+ options.DecimalMarker = ',';
+ }
- // Load values from iris.txt
- DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
- using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
- UInt32 labelPeriod = default;
- ValueGetter labelDelegatePeriod = cursorPeriod.GetGetter(columnsPeriod[0]);
- VBuffer featuresPeriod = default;
- ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]);
+ // Read dataset with comma as decimal marker.
+ var readerDecimalMarkerComma = new TextLoader(mlContext, options);
+ var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
+
+ // Load values from iris database with comma as decimal marker.
+ DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
+ using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
+ UInt32 labelComma = default;
+ ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]);
+ VBuffer featuresComma = default;
+ ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]);
// Iterate over each row and save labels and features to array for future comparison
int count = 0;
- while (cursorPeriod.MoveNext())
+ UInt32[] labels = new uint[150];
+ float[][] features = new float[150][];
+ while (cursorComma.MoveNext())
{
//Get values from respective columns
- labelDelegatePeriod(ref labelPeriod);
- featuresDelegatePeriod(ref featuresPeriod);
- labels[count] = labelPeriod;
- features[count] = featuresPeriod.GetValues().ToArray();
+ labelDelegateComma(ref labelComma);
+ featuresDelegateComma(ref featuresComma);
+ labels[count] = labelComma;
+ features[count] = featuresComma.GetValues().ToArray();
count++;
}
- // Read dataset with comma as decimal marker.
- string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
- var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+ // Read dataset with period as decimal marker.
+ string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
+ var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
},
- DecimalMarker = ','
+ DecimalMarker = '.'
});
- var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
+ var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod));
- // Load values from iris-decimal-marker-as-comma.txt
- DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
- using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
- UInt32 labelComma = default;
- ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]);
- VBuffer featuresComma = default;
- ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]);
+ // Load values from iris.txt where '.' is the decimal marker.
+ DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
+ using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
+ UInt32 labelPeriod = default;
+ ValueGetter labelDelegatePeriod = cursorPeriod.GetGetter(columnsPeriod[0]);
+ VBuffer featuresPeriod = default;
+ ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]);
- // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt
+ // Iterate the '.' dataset (cursorComma is already exhausted above) and check its values match those saved from the ',' dataset.
count = 0;
- while (cursorComma.MoveNext())
+ while (cursorPeriod.MoveNext())
{
//Get values from respective columns
- labelDelegateComma(ref labelComma);
- featuresDelegateComma(ref featuresComma);
- Assert.Equal(labels[count], labelComma);
- Assert.Equal(features[count], featuresComma.GetValues().ToArray());
+ labelDelegatePeriod(ref labelPeriod);
+ featuresDelegatePeriod(ref featuresPeriod);
+ Assert.Equal(labels[count], labelPeriod);
+ Assert.Equal(features[count], featuresPeriod.GetValues().ToArray());
count++;
}
}
@@ -933,7 +954,7 @@ public void TestCommaAsDecimalMarkerDouble()
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
},
DecimalMarker = '.'
});
@@ -947,11 +968,10 @@ public void TestCommaAsDecimalMarkerDouble()
VBuffer featuresPeriod = default;
ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]);
- UInt32[] labels = new uint[150];
- double[][] features = new double[150][];
-
// Iterate over each row and save labels and features to array for future comparison
int count = 0;
+ UInt32[] labels = new uint[150];
+ double[][] features = new double[150][];
while (cursorPeriod.MoveNext())
{
//Get values from respective columns
@@ -969,7 +989,7 @@ public void TestCommaAsDecimalMarkerDouble()
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
},
DecimalMarker = ','
});
@@ -996,144 +1016,53 @@ public void TestCommaAsDecimalMarkerDouble()
}
}
- [Fact]
- public void TestWrongDecimalMarkerInputs()
+ [Theory]
+ [InlineData(true)]
+ [InlineData(false)]
+ public void TestWrongDecimalMarkerInputs(bool useCommaAsDecimalMarker)
{
// When DecimalMarker does not match the actual decimal marker used in the dataset,
// we obtain values of NaN. Check that the values are indeed NaN in this case.
+ // Do this check for both cases where decimal markers in the dataset are '.' and ','.
var mlContext = new MLContext(seed: 1);
- // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ','.
- string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
- var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+ // Try reading a dataset where '.' is the actual decimal marker, but DecimalMarker = ',',
+ // and vice versa.
+ string dataPath;
+ TextLoader.Options options = new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) })
},
- DecimalMarker = ','
- });
- var textDataMismatched1 = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerPeriod));
+ };
+ if (useCommaAsDecimalMarker)
+ {
+ dataPath = GetDataPath("iris.txt"); // Has '.' as decimal marker inside dataset
+ options.DecimalMarker = ','; // Choose wrong decimal marker on purpose
+ }
+ else
+ {
+ dataPath = GetDataPath("iris-decimal-marker-as-comma.txt"); // Has ',' as decimal marker inside dataset
+ options.DecimalMarker = '.'; // Choose wrong decimal marker on purpose
+ }
+ var reader = new TextLoader(mlContext, options);
+ var textData = reader.Load(GetDataPath(dataPath));
// Check that the features being loaded are NaN.
- DataViewSchema columnsPeriod = textDataMismatched1.Schema;
- using DataViewRowCursor cursorPeriod = textDataMismatched1.GetRowCursor(columnsPeriod);
+ DataViewSchema columns = textData.Schema;
+ using DataViewRowCursor cursor = textData.GetRowCursor(columns);
VBuffer featuresPeriod = default;
- ValueGetter> featuresDelegatePeriod = cursorPeriod.GetGetter>(columnsPeriod[1]);
+ ValueGetter> featuresDelegatePeriod = cursor.GetGetter>(columns[1]);
- // Iterate over each row
- while (cursorPeriod.MoveNext())
+ // Iterate over each row and check that feature values are NaN.
+ while (cursor.MoveNext())
{
featuresDelegatePeriod.Invoke(ref featuresPeriod);
foreach(float feature in featuresPeriod.GetValues().ToArray())
Assert.Equal(feature, Single.NaN);
}
-
- // Try reading a dataset where ',' is the actual decimal marker, but DecimalMarker = '.'.
- string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
- var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
- {
- Columns = new[]
- {
- new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
- },
- DecimalMarker = '.'
- });
- var textDataMismatched2 = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerComma));
-
- DataViewSchema columnsComma = textDataMismatched2.Schema;
- using DataViewRowCursor cursorComma = textDataMismatched2.GetRowCursor(columnsComma);
- VBuffer featuresComma = default;
- ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]);
-
- // Iterate over each row
- while (cursorComma.MoveNext())
- {
- featuresDelegateComma.Invoke(ref featuresComma);
- foreach (float feature in featuresComma.GetValues().ToArray())
- Assert.Equal(feature, Single.NaN);
- }
- }
-
- [Fact]
- public void TestCommaAsDecimalMarkerWithSeperatorAsCommaInCSV()
- {
- // Check to confirm TextLoader can read data from a CSV file where the separator is ',' and decimals
- // enclosed with quotes and with the decimal marker ',' can be successfully read.
- string dataPathCsv = GetDataPath("iris-decimal-marker-as-comma.csv");
-
- var mlContext = new MLContext(seed: 1);
- var readerCsv = new TextLoader(mlContext, new TextLoader.Options()
- {
- Columns = new[]
- {
- new TextLoader.Column("Label", DataKind.Single, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
- },
- DecimalMarker = ',',
- Separator = ",",
- AllowQuoting = true,
- HasHeader = true
- });
- var textDataCsv = readerCsv.Load(GetDataPath(dataPathCsv));
-
- // Load values from iris-decimal-marker-as-comma.csv
- DataViewSchema columnsCsv = textDataCsv.Schema;
- using DataViewRowCursor cursorCsv = textDataCsv.GetRowCursor(columnsCsv);
- UInt32 labelCsv = default;
- ValueGetter labelDelegatePeriod = cursorCsv.GetGetter(columnsCsv[0]);
- VBuffer featuresCsv = default;
- ValueGetter> featuresDelegatePeriod = cursorCsv.GetGetter>(columnsCsv[1]);
-
- UInt32[] labels = new uint[150];
- double[][] features = new double[150][];
-
- // Iterate over each row and save labels and features to array for future comparison
- int count = 0;
- while (cursorCsv.MoveNext())
- {
- //Get values from respective columns
- labelDelegatePeriod(ref labelCsv);
- featuresDelegatePeriod(ref featuresCsv);
- labels[count] = labelCsv;
- features[count] = featuresCsv.GetValues().ToArray();
- count++;
- }
-
- // Read dataset with comma as decimal marker.
- string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
- var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
- {
- Columns = new[]
- {
- new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
- },
- DecimalMarker = ','
- });
- var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
-
- // Load values from iris-decimal-marker-as-comma.txt
- DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
- using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
- UInt32 labelComma = default;
- ValueGetter labelDelegateComma = cursorComma.GetGetter(columnsComma[0]);
- VBuffer featuresComma = default;
- ValueGetter> featuresDelegateComma = cursorComma.GetGetter>(columnsComma[1]);
-
- // Check values from iris-decimal-marker-as-comma.txt match those in iris-decimal-marker-as-comma.csv
- count = 0;
- while (cursorComma.MoveNext())
- {
- //Get values from respective columns
- labelDelegateComma(ref labelComma);
- featuresDelegateComma(ref featuresComma);
- Assert.Equal(labels[count], labelComma);
- Assert.Equal(features[count], featuresComma.GetValues().ToArray());
- count++;
- }
}
private class IrisNoFields
diff --git a/test/data/iris-decimal-marker-as-comma.csv b/test/data/iris-decimal-marker-as-comma.csv
index 0612dd281e..db1986ec8e 100644
--- a/test/data/iris-decimal-marker-as-comma.csv
+++ b/test/data/iris-decimal-marker-as-comma.csv
@@ -1,151 +1,151 @@
Label,Sepal,length,Sepal,width,Petal length,Petal width
-0,"5,1","3,5","1,4","0.2"
-0,"4,9","3,0","1,4","0.2"
-0,"4,7","3,2","1,3","0.2"
-0,"4,6","3,1","1,5","0.2"
-0,"5,0","3,6","1,4","0.2"
-0,"5,4","3,9","1,7","0.4"
-0,"4,6","3,4","1,4","0.3"
-0,"5,0","3,4","1,5","0.2"
-0,"4,4","2,9","1,4","0.2"
-0,"4,9","3,1","1,5","0.1"
-0,"5,4","3,7","1,5","0.2"
-0,"4,8","3,4","1,6","0.2"
-0,"4,8","3,0","1,4","0.1"
-0,"4,3","3,0","1,1","0.1"
-0,"5,8","4,0","1,2","0.2"
-0,"5,7","4,4","1,5","0.4"
-0,"5,4","3,9","1,3","0.4"
-0,"5,1","3,5","1,4","0.3"
-0,"5,7","3,8","1,7","0.3"
-0,"5,1","3,8","1,5","0.3"
-0,"5,4","3,4","1,7","0.2"
-0,"5,1","3,7","1,5","0.4"
-0,"4,6","3,6","1,0","0.2"
+0,"5,1","3,5","1,4","0,2"
+0,"4,9","3,0","1,4","0,2"
+0,"4,7","3,2","1,3","0,2"
+0,"4,6","3,1","1,5","0,2"
+0,"5,0","3,6","1,4","0,2"
+0,"5,4","3,9","1,7","0,4"
+0,"4,6","3,4","1,4","0,3"
+0,"5,0","3,4","1,5","0,2"
+0,"4,4","2,9","1,4","0,2"
+0,"4,9","3,1","1,5","0,1"
+0,"5,4","3,7","1,5","0,2"
+0,"4,8","3,4","1,6","0,2"
+0,"4,8","3,0","1,4","0,1"
+0,"4,3","3,0","1,1","0,1"
+0,"5,8","4,0","1,2","0,2"
+0,"5,7","4,4","1,5","0,4"
+0,"5,4","3,9","1,3","0,4"
+0,"5,1","3,5","1,4","0,3"
+0,"5,7","3,8","1,7","0,3"
+0,"5,1","3,8","1,5","0,3"
+0,"5,4","3,4","1,7","0,2"
+0,"5,1","3,7","1,5","0,4"
+0,"4,6","3,6","1,0","0,2"
0,"5,1","3,3","1,7","0,5"
-0,"4,8","3,4","1,9","0.2"
-0,"5,0","3,0","1,6","0.2"
-0,"5,0","3,4","1,6","0.4"
-0,"5,2","3,5","1,5","0.2"
-0,"5,2","3,4","1,4","0.2"
-0,"4,7","3,2","1,6","0.2"
-0,"4,8","3,1","1,6","0.2"
-0,"5,4","3,4","1,5","0.4"
-0,"5,2","4,1","1,5","0.1"
-0,"5,5","4,2","1,4","0.2"
-0,"4,9","3,1","1,5","0.1"
-0,"5,0","3,2","1,2","0.2"
-0,"5,5","3,5","1,3","0.2"
-0,"4,9","3,1","1,5","0.1"
-0,"4,4","3,0","1,3","0.2"
-0,"5,1","3,4","1,5","0.2"
-0,"5,0","3,5","1,3","0.3"
-0,"4,5","2,3","1,3","0.3"
-0,"4,4","3,2","1,3","0.2"
-0,"5,0","3,5","1,6","0.6"
-0,"5,1","3,8","1,9","0.4"
-0,"4,8","3,0","1,4","0.3"
-0,"5,1","3,8","1,6","0.2"
-0,"4,6","3,2","1,4","0.2"
-0,"5,3","3,7","1,5","0.2"
-0,"5,0","3,3","1,4","0.2"
-0,"7,0","3,2","4,7","1,4
-0,"6,4","3,2","4,5","1,5
-0,"6,9","3,1","4,9","1,5
-0,"5,5","2,3","4,0","1,3
-0,"6,5","2,8","4,6","1,5
-0,"5,7","2,8","4,5","1,3
-0,"6,3","3,3","4,7","1,6
-0,"4,9","2,4","3,3","1,0
-0,"6,6","2,9","4,6","1,3
-0,"5,2","2,7","3,9","1,4
-0,"5,0","2,0","3,5","1,0
-0,"5,9","3,0","4,2","1,5
-0,"6,0","2,2","4,0","1,0
-0,"6,1","2,9","4,7","1,4
-0,"5,6","2,9","3,6","1,3
-0,"6,7","3,1","4,4","1,4
-0,"5,6","3,0","4,5","1,5
-0,"5,8","2,7","4,1","1,0
-0,"6,2","2,2","4,5","1,5
-0,"5,6","2,5","3,9","1,1
-0,"5,9","3,2","4,8","1,8
-0,"6,1","2,8","4,0","1,3
-0,"6,3","2,5","4,9","1,5
-0,"6,1","2,8","4,7","1,2
-0,"6,4","2,9","4,3","1,3
-0,"6,6","3,0","4,4","1,4
-0,"6,8","2,8","4,8","1,4
-0,"6,7","3,0","5,0","1,7
-0,"6,0","2,9","4,5","1,5
-0,"5,7","2,6","3,5","1,0
-0,"5,5","2,4","3,8","1,1
-0,"5,5","2,4","3,7","1,0
-0,"5,8","2,7","3,9","1,2
-0,"6,0","2,7","5,1","1,6
-0,"5,4","3,0","4,5","1,5
-0,"6,0","3,4","4,5","1,6
-0,"6,7","3,1","4,7","1,5
-0,"6,3","2,3","4,4","1,3
-0,"5,6","3,0","4,1","1,3
-0,"5,5","2,5","4,0","1,3
-0,"5,5","2,6","4,4","1,2
-0,"6,1","3,0","4,6","1,4
-0,"5,8","2,6","4,0","1,2
-0,"5,0","2,3","3,3","1,0
-0,"5,6","2,7","4,2","1,3
-0,"5,7","3,0","4,2","1,2
-0,"5,7","2,9","4,2","1,3
-0,"6,2","2,9","4,3","1,3
-0,"5,1","2,5","3,0","1,1
-0,"5,7","2,8","4,1","1,3
-0,"6,3","3,3","6,0","2,5
-0,"5,8","2,7","5,1","1,9
-0,"7,1","3,0","5,9","2,1
-0,"6,3","2,9","5,6","1,8
-0,"6,5","3,0","5,8","2,2
-0,"7,6","3,0","6,6","2,1
-0,"4,9","2,5","4,5","1,7
-0,"7,3","2,9","6,3","1,8
-0,"6,7","2,5","5,8","1,8
-0,"7,2","3,6","6,1","2,5
-0,"6,5","3,2","5,1","2,0
-0,"6,4","2,7","5,3","1,9
-0,"6,8","3,0","5,5","2,1
-0,"5,7","2,5","5,0","2,0
-0,"5,8","2,8","5,1","2,4
-0,"6,4","3,2","5,3","2,3
-0,"6,5","3,0","5,5","1,8
-0,"7,7","3,8","6,7","2,2
-0,"7,7","2,6","6,9","2,3
-0,"6,0","2,2","5,0","1,5
-0,"6,9","3,2","5,7","2,3
-0,"5,6","2,8","4,9","2,0
-0,"7,7","2,8","6,7","2,0
-0,"6,3","2,7","4,9","1,8
-0,"6,7","3,3","5,7","2,1
-0,"7,2","3,2","6,0","1,8
-0,"6,2","2,8","4,8","1,8
-0,"6,1","3,0","4,9","1,8
-0,"6,4","2,8","5,6","2,1
-0,"7,2","3,0","5,8","1,6
-0,"7,4","2,8","6,1","1,9
-0,"7,9","3,8","6,4","2,0
-0,"6,4","2,8","5,6","2,2
-0,"6,3","2,8","5,1","1,5
-0,"6,1","2,6","5,6","1,4
-0,"7,7","3,0","6,1","2,3
-0,"6,3","3,4","5,6","2,4
-0,"6,4","3,1","5,5","1,8
-0,"6,0","3,0","4,8","1,8
-0,"6,9","3,1","5,4","2,1
-0,"6,7","3,1","5,6","2,4
-0,"6,9","3,1","5,1","2,3
-0,"5,8","2,7","5,1","1,9
-0,"6,8","3,2","5,9","2,3
-0,"6,7","3,3","5,7","2,5
-0,"6,7","3,0","5,2","2,3
-0,"6,3","2,5","5,0","1,9
-0,"6,5","3,0","5,2","2,0
-0,"6,2","3,4","5,4","2,3
-0,"5,9","3,0","5,1","1,8
+0,"4,8","3,4","1,9","0,2"
+0,"5,0","3,0","1,6","0,2"
+0,"5,0","3,4","1,6","0,4"
+0,"5,2","3,5","1,5","0,2"
+0,"5,2","3,4","1,4","0,2"
+0,"4,7","3,2","1,6","0,2"
+0,"4,8","3,1","1,6","0,2"
+0,"5,4","3,4","1,5","0,4"
+0,"5,2","4,1","1,5","0,1"
+0,"5,5","4,2","1,4","0,2"
+0,"4,9","3,1","1,5","0,1"
+0,"5,0","3,2","1,2","0,2"
+0,"5,5","3,5","1,3","0,2"
+0,"4,9","3,1","1,5","0,1"
+0,"4,4","3,0","1,3","0,2"
+0,"5,1","3,4","1,5","0,2"
+0,"5,0","3,5","1,3","0,3"
+0,"4,5","2,3","1,3","0,3"
+0,"4,4","3,2","1,3","0,2"
+0,"5,0","3,5","1,6","0,6"
+0,"5,1","3,8","1,9","0,4"
+0,"4,8","3,0","1,4","0,3"
+0,"5,1","3,8","1,6","0,2"
+0,"4,6","3,2","1,4","0,2"
+0,"5,3","3,7","1,5","0,2"
+0,"5,0","3,3","1,4","0,2"
+1,"7,0","3,2","4,7","1,4"
+1,"6,4","3,2","4,5","1,5"
+1,"6,9","3,1","4,9","1,5"
+1,"5,5","2,3","4,0","1,3"
+1,"6,5","2,8","4,6","1,5"
+1,"5,7","2,8","4,5","1,3"
+1,"6,3","3,3","4,7","1,6"
+1,"4,9","2,4","3,3","1,0"
+1,"6,6","2,9","4,6","1,3"
+1,"5,2","2,7","3,9","1,4"
+1,"5,0","2,0","3,5","1,0"
+1,"5,9","3,0","4,2","1,5"
+1,"6,0","2,2","4,0","1,0"
+1,"6,1","2,9","4,7","1,4"
+1,"5,6","2,9","3,6","1,3"
+1,"6,7","3,1","4,4","1,4"
+1,"5,6","3,0","4,5","1,5"
+1,"5,8","2,7","4,1","1,0"
+1,"6,2","2,2","4,5","1,5"
+1,"5,6","2,5","3,9","1,1"
+1,"5,9","3,2","4,8","1,8"
+1,"6,1","2,8","4,0","1,3"
+1,"6,3","2,5","4,9","1,5"
+1,"6,1","2,8","4,7","1,2"
+1,"6,4","2,9","4,3","1,3"
+1,"6,6","3,0","4,4","1,4"
+1,"6,8","2,8","4,8","1,4"
+1,"6,7","3,0","5,0","1,7"
+1,"6,0","2,9","4,5","1,5"
+1,"5,7","2,6","3,5","1,0"
+1,"5,5","2,4","3,8","1,1"
+1,"5,5","2,4","3,7","1,0"
+1,"5,8","2,7","3,9","1,2"
+1,"6,0","2,7","5,1","1,6"
+1,"5,4","3,0","4,5","1,5"
+1,"6,0","3,4","4,5","1,6"
+1,"6,7","3,1","4,7","1,5"
+1,"6,3","2,3","4,4","1,3"
+1,"5,6","3,0","4,1","1,3"
+1,"5,5","2,5","4,0","1,3"
+1,"5,5","2,6","4,4","1,2"
+1,"6,1","3,0","4,6","1,4"
+1,"5,8","2,6","4,0","1,2"
+1,"5,0","2,3","3,3","1,0"
+1,"5,6","2,7","4,2","1,3"
+1,"5,7","3,0","4,2","1,2"
+1,"5,7","2,9","4,2","1,3"
+1,"6,2","2,9","4,3","1,3"
+1,"5,1","2,5","3,0","1,1"
+1,"5,7","2,8","4,1","1,3"
+2,"6,3","3,3","6,0","2,5"
+2,"5,8","2,7","5,1","1,9"
+2,"7,1","3,0","5,9","2,1"
+2,"6,3","2,9","5,6","1,8"
+2,"6,5","3,0","5,8","2,2"
+2,"7,6","3,0","6,6","2,1"
+2,"4,9","2,5","4,5","1,7"
+2,"7,3","2,9","6,3","1,8"
+2,"6,7","2,5","5,8","1,8"
+2,"7,2","3,6","6,1","2,5"
+2,"6,5","3,2","5,1","2,0"
+2,"6,4","2,7","5,3","1,9"
+2,"6,8","3,0","5,5","2,1"
+2,"5,7","2,5","5,0","2,0"
+2,"5,8","2,8","5,1","2,4"
+2,"6,4","3,2","5,3","2,3"
+2,"6,5","3,0","5,5","1,8"
+2,"7,7","3,8","6,7","2,2"
+2,"7,7","2,6","6,9","2,3"
+2,"6,0","2,2","5,0","1,5"
+2,"6,9","3,2","5,7","2,3"
+2,"5,6","2,8","4,9","2,0"
+2,"7,7","2,8","6,7","2,0"
+2,"6,3","2,7","4,9","1,8"
+2,"6,7","3,3","5,7","2,1"
+2,"7,2","3,2","6,0","1,8"
+2,"6,2","2,8","4,8","1,8"
+2,"6,1","3,0","4,9","1,8"
+2,"6,4","2,8","5,6","2,1"
+2,"7,2","3,0","5,8","1,6"
+2,"7,4","2,8","6,1","1,9"
+2,"7,9","3,8","6,4","2,0"
+2,"6,4","2,8","5,6","2,2"
+2,"6,3","2,8","5,1","1,5"
+2,"6,1","2,6","5,6","1,4"
+2,"7,7","3,0","6,1","2,3"
+2,"6,3","3,4","5,6","2,4"
+2,"6,4","3,1","5,5","1,8"
+2,"6,0","3,0","4,8","1,8"
+2,"6,9","3,1","5,4","2,1"
+2,"6,7","3,1","5,6","2,4"
+2,"6,9","3,1","5,1","2,3"
+2,"5,8","2,7","5,1","1,9"
+2,"6,8","3,2","5,9","2,3"
+2,"6,7","3,3","5,7","2,5"
+2,"6,7","3,0","5,2","2,3"
+2,"6,3","2,5","5,0","1,9"
+2,"6,5","3,0","5,2","2,0"
+2,"6,2","3,4","5,4","2,3"
+2,"5,9","3,0","5,1","1,8"
From 247af634301e7116481dec8676b8d0dfcd3c557e Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Fri, 22 May 2020 11:01:45 -0700
Subject: [PATCH 09/10] nit fix
---
src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 2c64b98a14..6bc58de054 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -1229,7 +1229,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
if (options.DecimalMarker != '.' && options.DecimalMarker != ',')
throw _host.ExceptUserArg(nameof(Options.DecimalMarker), "Decimal marker cannot be the '{0}' character. It must be '.' or ','.", options.DecimalMarker);
- if (!options.AllowQuoting && options.DecimalMarker == ',' && options.Separator == ",")
+ if (!options.AllowQuoting && options.DecimalMarker == ',' && _separators.Contains(','))
throw _host.ExceptUserArg(nameof(Options.AllowQuoting), "Quoting must be allowed if decimal marker and separator are the ',' character.");
_decimalMarker = options.DecimalMarker;
_escapeChar = options.EscapeChar;
From 8492472c0ca808ffd7397f37370029e618f0b25a Mon Sep 17 00:00:00 2001
From: Mustafa Bal <5262061+mstfbl@users.noreply.github.com>
Date: Fri, 22 May 2020 12:50:35 -0700
Subject: [PATCH 10/10] Compressed tests using generics
---
test/Microsoft.ML.Tests/TextLoaderTests.cs | 143 ++++++---------------
1 file changed, 40 insertions(+), 103 deletions(-)
diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs
index 9e31ff20f3..a4d44c5cc2 100644
--- a/test/Microsoft.ML.Tests/TextLoaderTests.cs
+++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs
@@ -843,108 +843,30 @@ public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
[Theory]
[InlineData(true)]
[InlineData(false)]
- public void TestCommaAsDecimalMarkerFloat(bool useCsvVersion)
+ public void TestCommaAsDecimalMarker(bool useCsvVersion)
{
- // When userCsvVersion == false:
+ // When useCsvVersion == false:
// Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their
// decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt.
- // Here, the features are of type float (Single), and the test checks for decimal markers with floats.
- // When userCsvVersion == true:
+ // When useCsvVersion == true:
// Check to confirm TextLoader can read data from a CSV file where the separator is ',', decimals
- // enclosed with quotes, and with the decimal marker being ','. Features are of type float (Single),
- // and the test checks for decimal markers with floats.
- var mlContext = new MLContext(seed: 1);
+ // are enclosed with quotes, and with the decimal marker being ','.
- string dataPathDecimalMarkerComma;
- TextLoader.Options options = new TextLoader.Options()
- {
- Columns = new[]
- {
- new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) })
- },
- };
- if (useCsvVersion)
- {
- dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv");
- options.DecimalMarker = ',';
- options.Separator = ",";
- options.AllowQuoting = true;
- options.HasHeader = true;
- }
- else
- {
- dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
- options.DecimalMarker = ',';
- }
-
- // Read dataset with comma as decimal marker.
- var readerDecimalMarkerComma = new TextLoader(mlContext, options);
- var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
-
- // Load values from iris database with comma as decimal marker.
- DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
- using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
- UInt32 labelComma = default;
- ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]);
- VBuffer<float> featuresComma = default;
- ValueGetter<VBuffer<float>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<float>>(columnsComma[1]);
-
- // Iterate over each row and save labels and features to array for future comparison
- int count = 0;
- UInt32[] labels = new uint[150];
- float[][] features = new float[150][];
- while (cursorComma.MoveNext())
- {
- //Get values from respective columns
- labelDelegateComma(ref labelComma);
- featuresDelegateComma(ref featuresComma);
- labels[count] = labelComma;
- features[count] = featuresComma.GetValues().ToArray();
- count++;
- }
-
- // Read dataset with period as decimal marker.
- string dataPathDecimalMarkerPeriod = GetDataPath("iris.txt");
- var readerDecimalMarkerPeriod = new TextLoader(mlContext, new TextLoader.Options()
- {
- Columns = new[]
- {
- new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
- },
- DecimalMarker = '.'
- });
- var textDataDecimalMarkerPeriod = readerDecimalMarkerPeriod.Load(GetDataPath(dataPathDecimalMarkerPeriod));
-
- // Load values from iris.txt where '.' is the decimal marker.
- DataViewSchema columnsPeriod = textDataDecimalMarkerPeriod.Schema;
- using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
- UInt32 labelPeriod = default;
- ValueGetter<UInt32> labelDelegatePeriod = cursorComma.GetGetter<UInt32>(columnsPeriod[0]);
- VBuffer<float> featuresPeriod = default;
- ValueGetter<VBuffer<float>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<float>>(columnsPeriod[1]);
-
- // Check values from database with ',' as decimal marker with database with '.' as decimal marker.
- count = 0;
- while (cursorComma.MoveNext())
- {
- //Get values from respective columns
- labelDelegatePeriod(ref labelPeriod);
- featuresDelegatePeriod(ref featuresPeriod);
- Assert.Equal(labels[count], labelPeriod);
- Assert.Equal(features[count], featuresPeriod.GetValues().ToArray());
- count++;
- }
+ // Do these checks with both float and double as types of features being read, to test decimal marker
+ // recognition with both doubles and floats.
+ TestCommaAsDecimalMarkerHelper<float>(useCsvVersion);
+ TestCommaAsDecimalMarkerHelper<double>(useCsvVersion);
}
-
- [Fact]
- public void TestCommaAsDecimalMarkerDouble()
+
+ private void TestCommaAsDecimalMarkerHelper<T>(bool useCsvVersion)
{
// Datasets iris.txt and iris-decimal-marker-as-comma.txt are the exact same, except for their
// decimal markers. Decimal marker in iris.txt is '.', and ',' in iris-decimal-marker-as-comma.txt.
- // Here, the features are of type double, and the test checks for decimal markers with double.
+ // Datasets iris.txt and iris-decimal-marker-as-comma.csv have the exact same data, however the .csv
+ // version has ',' as decimal marker and separator, and feature values are enclosed with quotes.
+ // T varies as either float or double, so that decimal markers can be tested for both floating
+ // point value types.
var mlContext = new MLContext(seed: 1);
// Read dataset with period as decimal marker.
@@ -954,7 +876,7 @@ public void TestCommaAsDecimalMarkerDouble()
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) }),
},
DecimalMarker = '.'
});
@@ -965,13 +887,13 @@ public void TestCommaAsDecimalMarkerDouble()
using DataViewRowCursor cursorPeriod = textDataDecimalMarkerPeriod.GetRowCursor(columnsPeriod);
UInt32 labelPeriod = default;
ValueGetter<UInt32> labelDelegatePeriod = cursorPeriod.GetGetter<UInt32>(columnsPeriod[0]);
- VBuffer<double> featuresPeriod = default;
- ValueGetter<VBuffer<double>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<double>>(columnsPeriod[1]);
+ VBuffer<T> featuresPeriod = default;
+ ValueGetter<VBuffer<T>> featuresDelegatePeriod = cursorPeriod.GetGetter<VBuffer<T>>(columnsPeriod[1]);
// Iterate over each row and save labels and features to array for future comparison
int count = 0;
UInt32[] labels = new uint[150];
- double[][] features = new double[150][];
+ T[][] features = new T[150][];
while (cursorPeriod.MoveNext())
{
//Get values from respective columns
@@ -983,27 +905,42 @@ public void TestCommaAsDecimalMarkerDouble()
}
// Read dataset with comma as decimal marker.
- string dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
- var readerDecimalMarkerComma = new TextLoader(mlContext, new TextLoader.Options()
+ // Dataset is either the .csv version or the .txt version.
+ string dataPathDecimalMarkerComma;
+ TextLoader.Options options = new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Label", DataKind.UInt32, 0),
- new TextLoader.Column("Features", DataKind.Double, new [] { new TextLoader.Range(1, 4) }),
+ new TextLoader.Column("Features", typeof(T) == typeof(double) ? DataKind.Double : DataKind.Single, new [] { new TextLoader.Range(1, 4) })
},
- DecimalMarker = ','
- });
+ };
+ // Set TextLoader.Options for the .csv or .txt cases.
+ if (useCsvVersion)
+ {
+ dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.csv");
+ options.DecimalMarker = ',';
+ options.Separator = ",";
+ options.AllowQuoting = true;
+ options.HasHeader = true;
+ }
+ else
+ {
+ dataPathDecimalMarkerComma = GetDataPath("iris-decimal-marker-as-comma.txt");
+ options.DecimalMarker = ',';
+ }
+ var readerDecimalMarkerComma = new TextLoader(mlContext, options);
var textDataDecimalMarkerComma = readerDecimalMarkerComma.Load(GetDataPath(dataPathDecimalMarkerComma));
- // Load values from iris-decimal-marker-as-comma.txt
+ // Load values from dataset with comma as decimal marker
DataViewSchema columnsComma = textDataDecimalMarkerComma.Schema;
using DataViewRowCursor cursorComma = textDataDecimalMarkerComma.GetRowCursor(columnsComma);
UInt32 labelComma = default;
ValueGetter<UInt32> labelDelegateComma = cursorComma.GetGetter<UInt32>(columnsComma[0]);
- VBuffer<double> featuresComma = default;
- ValueGetter<VBuffer<double>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<double>>(columnsComma[1]);
+ VBuffer<T> featuresComma = default;
+ ValueGetter<VBuffer<T>> featuresDelegateComma = cursorComma.GetGetter<VBuffer<T>>(columnsComma[1]);
- // Check values from iris-decimal-marker-as-comma.txt match those in iris.txt
+ // Check values from dataset with comma as decimal marker match those in iris.txt (period decimal marker)
count = 0;
while (cursorComma.MoveNext())
{