diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs index a50806f780..a9210c7369 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs @@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path Separators = new[] { splitInference.Separator.Value }, AllowSparse = splitInference.AllowSparse, AllowQuoting = splitInference.AllowQuote, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, TrimWhitespace = trimWhitespace }; @@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path AllowQuoting = splitInference.AllowQuote, AllowSparse = splitInference.AllowSparse, Separators = new char[] { splitInference.Separator.Value }, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, TrimWhitespace = trimWhitespace }; @@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co Separator = splitInference.Separator.Value, AllowSparse = splitInference.AllowSparse, AllowQuote = splitInference.AllowQuote, + ReadMultilines = splitInference.ReadMultilines, HasHeader = hasHeader, LabelColumnIndex = labelColumnIndex, Label = label diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs index b8acb44de4..6d00054dc5 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs @@ -32,6 +32,7 @@ internal sealed class Arguments public int MaxRowsToRead; public uint? 
LabelColumnIndex; public string Label; + public bool ReadMultilines; public Arguments() { @@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I Separators = new[] { args.Separator }, AllowSparse = args.AllowSparse, AllowQuoting = args.AllowQuote, + ReadMultilines = args.ReadMultilines, }; var textLoader = context.Data.CreateTextLoader(textLoaderOptions); var idv = textLoader.Load(fileSource); diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs index fe0066ab6e..d04a025626 100644 --- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -23,14 +23,16 @@ public class ColumnSplitResult public bool AllowQuote { get; set; } public bool AllowSparse { get; set; } + public bool ReadMultilines { get; set; } - public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount) + public ColumnSplitResult(bool isSuccess, char? 
separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount) { IsSuccess = isSuccess; Separator = separator; AllowQuote = allowQuote; AllowSparse = allowSparse; ColumnCount = columnCount; + ReadMultilines = readMultilines; } } @@ -50,12 +52,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS { var sparse = new[] { false, true }; var quote = new[] { true, false }; + var tryMultiline = new[] { false, true }; var foundAny = false; var result = default(ColumnSplitResult); foreach (var perm in (from _allowSparse in sparse from _allowQuote in quote from _sep in separatorCandidates - select new { _allowSparse, _allowQuote, _sep })) + from _tryMultiline in tryMultiline + select new { _allowSparse, _allowQuote, _sep, _tryMultiline })) { var options = new TextLoader.Options { @@ -66,7 +70,8 @@ from _sep in separatorCandidates } }, Separators = new[] { perm._sep }, AllowQuoting = perm._allowQuote, - AllowSparse = perm._allowSparse + AllowSparse = perm._allowSparse, + ReadMultilines = perm._tryMultiline, }; if (TryParseFile(context, options, source, out result)) @@ -75,7 +80,7 @@ from _sep in separatorCandidates break; } } - return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0); + return foundAny ? 
result : new ColumnSplitResult(false, null, true, true, true, 0); } private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source, @@ -111,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options, // disallow single-column case if (mostCommon.Key <= 1) { return false; } - result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key); + result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key); return true; } // fail gracefully if unable to instantiate data view with swept arguments diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs index eba7b9b7b3..d1b0041472 100644 --- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using FluentAssertions; using Microsoft.ML.Data; using Microsoft.ML.TestFramework; using Xunit; @@ -186,5 +187,39 @@ public void InferColumnsColumnInfoParam() Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); Assert.Null(result.ColumnInformation.ExampleWeightColumnName); } + + [Fact] + public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes() + { + var context = new MLContext(); + var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt"); + var sample = TextFileSample.CreateFromFullFile(dataset); + var result = TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators); + + result.ColumnCount.Should().Be(4); + result.Separator.Should().Be(','); + result.IsSuccess.Should().BeTrue(); + } + + [Fact] + public void InferColumnsFromMultilineInputFile() + { + // Check if we can infer 
the column information + // from an input file which has escaped newlines inside quotes + var dataPath = GetDataPath("multiline.csv"); + MLContext mlContext = new MLContext(); + var inputColumnInformation = new ColumnInformation(); + inputColumnInformation.LabelColumnName = @"id"; + var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation); + + // File has 3 columns: "id", "description" and "animal" + Assert.NotNull(result.ColumnInformation.LabelColumnName); + Assert.Equal(1, result.ColumnInformation.TextColumnNames.Count); + Assert.Equal(1, result.ColumnInformation.CategoricalColumnNames.Count); + + Assert.Equal("id", result.ColumnInformation.LabelColumnName); + Assert.Equal("description", result.ColumnInformation.TextColumnNames.First()); + Assert.Equal("animal", result.ColumnInformation.CategoricalColumnNames.First()); + } } -} \ No newline at end of file +} diff --git a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj index 1a5946558a..af86334edf 100644 --- a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj +++ b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj @@ -7,10 +7,14 @@ + + + PreserveNewest + PreserveNewest diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt new file mode 100644 index 0000000000..afaf368195 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt @@ -0,0 +1,36 @@ +id,Column1,Column2,Column3 +1,this is a description, 1,2 +2,"this is a quote description",1,2 +3,"this is a quote description with double quote("")",1,2 +4,"this is a quote description with ""a pair of double quote""",1,2 +5,"this is a quote description with new line +quote",1,2 +6,"this is a quote description with +new line1 and +new line2 and empty line + +and double quote""",1,2 +7, this is a 
description with single quote("),1,2 +// empty line between quotes +8,"",1,2 +// single quote between quotes +9,"""",1,2 +// simply newline between quotes +10," + + + +",1,2 +// simply signle quote and newline between quotes +11," + +"""" + +"" + +"" + +",1,2 + + +