From c073e6b3bcb271afe002cd1fb354bb2309551a28 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 28 Apr 2020 12:25:38 -0700 Subject: [PATCH 1/5] upgrade to 3.1 --- .../Templates/Console/PredictProject.cs | 2 +- .../Templates/Console/PredictProject.tt | 2 +- ...s.AzureCodeGeneratorTest.test.ConsoleApp.csproj.approved.txt | 2 +- ...CodeGeneratorTest.CodeGenTest.ConsoleApp.csproj.approved.txt | 2 +- ...GeneratorTests.ConsoleAppProjectFileContentTest.approved.txt | 2 +- ...eConsoleAppProjectContents_VerifyPredictProject.approved.txt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.cs b/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.cs index 96c4bbf124..29f288a076 100644 --- a/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.cs +++ b/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.cs @@ -27,7 +27,7 @@ internal partial class PredictProject : PredictProjectBase public virtual string TransformText() { this.Write("\r\n\r\n \r\n Exe\r\n netcoreapp2.1\r\n \r\n netcoreapp3.1\r\n \r\n \r\n \r\n"); diff --git a/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.tt b/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.tt index 1dd932085e..15e82d409c 100644 --- a/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.tt +++ b/src/Microsoft.ML.CodeGenerator/Templates/Console/PredictProject.tt @@ -8,7 +8,7 @@ Exe - netcoreapp2.1 + netcoreapp3.1 diff --git a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureCodeGeneratorTest.test.ConsoleApp.csproj.approved.txt b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureCodeGeneratorTest.test.ConsoleApp.csproj.approved.txt index 66e5ca3695..f94467d5dc 100644 --- a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureCodeGeneratorTest.test.ConsoleApp.csproj.approved.txt +++ 
b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureCodeGeneratorTest.test.ConsoleApp.csproj.approved.txt @@ -2,7 +2,7 @@ Exe - netcoreapp2.1 + netcoreapp3.1 diff --git a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureImageCodeGeneratorTest.CodeGenTest.ConsoleApp.csproj.approved.txt b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureImageCodeGeneratorTest.CodeGenTest.ConsoleApp.csproj.approved.txt index 23fa52e0c1..5a0f249e82 100644 --- a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureImageCodeGeneratorTest.CodeGenTest.ConsoleApp.csproj.approved.txt +++ b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.AzureImageCodeGeneratorTest.CodeGenTest.ConsoleApp.csproj.approved.txt @@ -2,7 +2,7 @@ Exe - netcoreapp2.1 + netcoreapp3.1 diff --git a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.ConsoleAppProjectFileContentTest.approved.txt b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.ConsoleAppProjectFileContentTest.approved.txt index 9ef4077dee..97df9ea672 100644 --- a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.ConsoleAppProjectFileContentTest.approved.txt +++ b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.ConsoleAppProjectFileContentTest.approved.txt @@ -2,7 +2,7 @@ Exe - netcoreapp2.1 + netcoreapp3.1 diff --git a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.Recommendation_GenerateConsoleAppProjectContents_VerifyPredictProject.approved.txt b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.Recommendation_GenerateConsoleAppProjectContents_VerifyPredictProject.approved.txt index 14a45f8c0c..e261bed0fd 100644 --- 
a/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.Recommendation_GenerateConsoleAppProjectContents_VerifyPredictProject.approved.txt +++ b/test/Microsoft.ML.CodeGenerator.Tests/ApprovalTests/ConsoleCodeGeneratorTests.Recommendation_GenerateConsoleAppProjectContents_VerifyPredictProject.approved.txt @@ -2,7 +2,7 @@ Exe - netcoreapp2.1 + netcoreapp3.1 From 8f0fc1a59dd3aa6186ec4e94f9cccf93fab4dae4 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 28 Apr 2020 12:31:31 -0700 Subject: [PATCH 2/5] write inline data using invariantCulture --- src/Microsoft.ML.CodeGenerator/Utils.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.CodeGenerator/Utils.cs b/src/Microsoft.ML.CodeGenerator/Utils.cs index 216097064c..075bc577ce 100644 --- a/src/Microsoft.ML.CodeGenerator/Utils.cs +++ b/src/Microsoft.ML.CodeGenerator/Utils.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Globalization; using System.IO; using System.Linq; using System.Reflection; @@ -101,7 +102,7 @@ internal static string GetValueFromColumn(DataViewRowCursor rowCursor, DataVi return "Single.NegativeInfinity"; } - return f?.ToString() + "F"; + return f?.ToString(CultureInfo.InvariantCulture) + "F"; } if (val is bool) From ecaaaf0eeb8293c73dd561c0dfd61d321c314ebf Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 20 May 2020 12:06:28 -0700 Subject: [PATCH 3/5] add tryMulti in AutoML and test --- .../ColumnInference/TextFileContents.cs | 7 ++-- .../ColumnInferenceTests.cs | 14 ++++++++ .../Microsoft.ML.AutoML.Tests.csproj | 4 +++ .../DatasetWithNewlineBetweenQuotes.txt | 36 +++++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs index fe0066ab6e..2a249dced6 
100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -50,12 +50,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS { var sparse = new[] { false, true }; var quote = new[] { true, false }; + var tryMultiline = new[] { false, true }; var foundAny = false; var result = default(ColumnSplitResult); foreach (var perm in (from _allowSparse in sparse from _allowQuote in quote from _sep in separatorCandidates - select new { _allowSparse, _allowQuote, _sep })) + from _tryMultiline in tryMultiline + select new { _allowSparse, _allowQuote, _sep, _tryMultiline })) { var options = new TextLoader.Options { @@ -66,7 +68,8 @@ from _sep in separatorCandidates } }, Separators = new[] { perm._sep }, AllowQuoting = perm._allowQuote, - AllowSparse = perm._allowSparse + AllowSparse = perm._allowSparse, + ReadMultilines = perm._tryMultiline, }; if (TryParseFile(context, options, source, out result)) diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index eba7b9b7b3..695eb82eca 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using FluentAssertions; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Xunit; @@ -186,5 +187,18 @@ public void InferColumnsColumnInfoParam() Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); Assert.Null(result.ColumnInformation.ExampleWeightColumnName); } + + [Fact] + public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes() + { + var context = new MLContext(); + var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt"); + var sample = TextFileSample.CreateFromFullFile(dataset); + var result = 
TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators); + + result.ColumnCount.Should().Be(4); + result.Separator.Should().Be(','); + result.IsSuccess.Should().BeTrue(); + } } } \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj index 1a5946558a..af86334edf 100644 --- a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj +++ b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj @@ -7,10 +7,14 @@ + + + PreserveNewest + PreserveNewest diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt new file mode 100644 index 0000000000..afaf368195 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt @@ -0,0 +1,36 @@ +id,Column1,Column2,Column3 +1,this is a description, 1,2 +2,"this is a quote description",1,2 +3,"this is a quote description with double quote("")",1,2 +4,"this is a quote description with ""a pair of double quote""",1,2 +5,"this is a quote description with new line +quote",1,2 +6,"this is a quote description with +new line1 and +new line2 and empty line + +and double quote""",1,2 +7, this is a description with single quote("),1,2 +// empty line between quotes +8,"",1,2 +// single quote between quotes +9,"""",1,2 +// simply newline between quotes +10," + + + +",1,2 +// simply signle quote and newline between quotes +11," + +"""" + +"" + +"" + +",1,2 + + + From e7cb2b50c8e0abd870adf1fa9cf4dd0e919504cc Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 20 May 2020 13:20:27 -0700 Subject: [PATCH 4/5] add test for AutoML inferColumn API --- .../ColumnInference/ColumnInferenceApi.cs | 3 +++ .../ColumnInference/ColumnTypeInference.cs | 2 ++ .../ColumnInference/TextFileContents.cs | 8 ++++--- .../ColumnInferenceTests.cs | 21 
+++++++++++++++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs index a50806f780..a9210c7369 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs @@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path Separators = new[] { splitInference.Separator.Value }, AllowSparse = splitInference.AllowSparse, AllowQuoting = splitInference.AllowQuote, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, TrimWhitespace = trimWhitespace }; @@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowQuoting = splitInference.AllowQuote, AllowSparse = splitInference.AllowSparse, Separators = new char[] { splitInference.Separator.Value }, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, TrimWhitespace = trimWhitespace }; @@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co Separator = splitInference.Separator.Value, AllowSparse = splitInference.AllowSparse, AllowQuote = splitInference.AllowQuote, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, LabelColumnIndex = labelColumnIndex, Label = label diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs index b8acb44de4..6d00054dc5 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs @@ -32,6 +32,7 @@ internal sealed class Arguments public int MaxRowsToRead; public uint? 
LabelColumnIndex; public string Label; + public bool ReadMultilines; public Arguments() { @@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I Separators = new[] { args.Separator }, AllowSparse = args.AllowSparse, AllowQuoting = args.AllowQuote, + ReadMultilines = args.ReadMultilines, }; var textLoader = context.Data.CreateTextLoader(textLoaderOptions); var idv = textLoader.Load(fileSource); diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs index 2a249dced6..d04a025626 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -23,14 +23,16 @@ public class ColumnSplitResult public bool AllowQuote { get; set; } public bool AllowSparse { get; set; } + public bool ReadMultilines { get; set; } - public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount) + public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount) { IsSuccess = isSuccess; Separator = separator; AllowQuote = allowQuote; AllowSparse = allowSparse; ColumnCount = columnCount; + ReadMultilines = readMultilines; } } @@ -78,7 +80,7 @@ from _tryMultiline in tryMultiline break; } } - return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0); + return foundAny ? 
result : new ColumnSplitResult(false, null, true, true, true, 0); } private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source, @@ -114,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options, // disallow single-column case if (mostCommon.Key <= 1) { return false; } - result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key); + result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key); return true; } // fail gracefully if unable to instantiate data view with swept arguments diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index 695eb82eca..96f12f78db 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -200,5 +200,26 @@ public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_ result.Separator.Should().Be(','); result.IsSuccess.Should().BeTrue(); } + + [Fact] + public void InferColumnsFromMultilineInputFile() + { + // Check if we can infer the column information + // from an input file which has escaped newlines inside quotes + var dataPath = GetDataPath("multiline.csv"); + MLContext mlContext = new MLContext(); + var inputColumnInformation = new ColumnInformation(); + inputColumnInformation.LabelColumnName = @"id"; + var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation); + + // File only have 3 columns: "id", "description" and "animal" + Assert.NotNull(result.ColumnInformation.LabelColumnName); + Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count); + Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count); + + Assert.Equal("id", result.ColumnInformation.LabelColumnName); + Assert.Equal("description", 
result.ColumnInformation.TextColumnNames.First()); + Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First()); + } } } \ No newline at end of file From 2c04d09625be892d1a9b201e1152a854b975a26f Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Thu, 21 May 2020 11:43:44 -0700 Subject: [PATCH 5/5] Update test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs Co-authored-by: Justin Ormont --- test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index 96f12f78db..d1b0041472 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -212,7 +212,7 @@ public void InferColumnsFromMultilineInputFile() inputColumnInformation.LabelColumnName = @"id"; var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation); - // File only have 3 columns: "id", "description" and "animal" + // File has 3 columns: "id", "description" and "animal" Assert.NotNull(result.ColumnInformation.LabelColumnName); Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count); Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count); @@ -222,4 +222,4 @@ public void InferColumnsFromMultilineInputFile() Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First()); } } -} \ No newline at end of file +}