diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
index a50806f780..a9210c7369 100644
--- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
+++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
@@ -55,6 +55,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
Separators = new[] { splitInference.Separator.Value },
AllowSparse = splitInference.AllowSparse,
AllowQuoting = splitInference.AllowQuote,
+ ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
@@ -91,6 +92,7 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
AllowQuoting = splitInference.AllowQuote,
AllowSparse = splitInference.AllowSparse,
Separators = new char[] { splitInference.Separator.Value },
+ ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
TrimWhitespace = trimWhitespace
};
@@ -139,6 +141,7 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
Separator = splitInference.Separator.Value,
AllowSparse = splitInference.AllowSparse,
AllowQuote = splitInference.AllowQuote,
+ ReadMultilines = splitInference.ReadMultilines,
HasHeader = hasHeader,
LabelColumnIndex = labelColumnIndex,
Label = label
diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs
index b8acb44de4..6d00054dc5 100644
--- a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs
+++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs
@@ -32,6 +32,7 @@ internal sealed class Arguments
public int MaxRowsToRead;
public uint? LabelColumnIndex;
public string Label;
+ public bool ReadMultilines;
public Arguments()
{
@@ -262,6 +263,7 @@ private static InferenceResult InferTextFileColumnTypesCore(MLContext context, I
Separators = new[] { args.Separator },
AllowSparse = args.AllowSparse,
AllowQuoting = args.AllowQuote,
+ ReadMultilines = args.ReadMultilines,
};
var textLoader = context.Data.CreateTextLoader(textLoaderOptions);
var idv = textLoader.Load(fileSource);
diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
index fe0066ab6e..d04a025626 100644
--- a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
+++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
@@ -23,14 +23,16 @@ public class ColumnSplitResult
public bool AllowQuote { get; set; }
public bool AllowSparse { get; set; }
+ public bool ReadMultilines { get; set; }
- public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount)
+ public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool readMultilines, bool allowSparse, int columnCount) // NOTE(review): readMultilines is positioned between allowQuote and allowSparse; the three bools are positional, so call sites must pass them in exactly this order.
{
IsSuccess = isSuccess;
Separator = separator;
AllowQuote = allowQuote;
AllowSparse = allowSparse;
ColumnCount = columnCount;
+ ReadMultilines = readMultilines; // Records the multiline-reading setting associated with this split result.
}
}
@@ -50,12 +52,14 @@ public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamS
{
var sparse = new[] { false, true };
var quote = new[] { true, false };
+ var tryMultiline = new[] { false, true };
var foundAny = false;
var result = default(ColumnSplitResult);
foreach (var perm in (from _allowSparse in sparse
from _allowQuote in quote
from _sep in separatorCandidates
- select new { _allowSparse, _allowQuote, _sep }))
+ from _tryMultiline in tryMultiline
+ select new { _allowSparse, _allowQuote, _sep, _tryMultiline }))
{
var options = new TextLoader.Options
{
@@ -66,7 +70,8 @@ from _sep in separatorCandidates
} },
Separators = new[] { perm._sep },
AllowQuoting = perm._allowQuote,
- AllowSparse = perm._allowSparse
+ AllowSparse = perm._allowSparse,
+ ReadMultilines = perm._tryMultiline,
};
if (TryParseFile(context, options, source, out result))
@@ -75,7 +80,7 @@ from _sep in separatorCandidates
break;
}
}
- return foundAny ? result : new ColumnSplitResult(false, null, true, true, 0);
+ return foundAny ? result : new ColumnSplitResult(false, null, true, true, true, 0);
}
private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source,
@@ -111,7 +116,7 @@ private static bool TryParseFile(MLContext context, TextLoader.Options options,
// disallow single-column case
if (mostCommon.Key <= 1) { return false; }
- result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key);
+ result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.ReadMultilines, options.AllowSparse, mostCommon.Key);
return true;
}
// fail gracefully if unable to instantiate data view with swept arguments
diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
index eba7b9b7b3..d1b0041472 100644
--- a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
+++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs
@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
+using FluentAssertions;
using Microsoft.ML.Data;
using Microsoft.ML.TestFramework;
using Xunit;
@@ -186,5 +187,39 @@ public void InferColumnsColumnInfoParam()
Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First());
Assert.Null(result.ColumnInformation.ExampleWeightColumnName);
}
+
+ [Fact]
+ public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
+ {
+ // The dataset has quoted fields containing newlines, empty lines and doubled quotes.
+ var context = new MLContext();
+ var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt");
+ var sample = TextFileSample.CreateFromFullFile(dataset);
+ var result = TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators);
+
+ // Assert success first: a failed split yields a null separator and a column count of 0,
+ // so checking those first would report a misleading mismatch instead of the real failure.
+ result.IsSuccess.Should().BeTrue();
+ result.Separator.Should().Be(',');
+ result.ColumnCount.Should().Be(4);
+ }
+
+ [Fact]
+ public void InferColumnsFromMultilineInputFile()
+ {
+ // Check if we can infer the column information
+ // from an input file which has escaped newlines inside quotes
+ var dataPath = GetDataPath("multiline.csv");
+ MLContext mlContext = new MLContext();
+ var inputColumnInformation = new ColumnInformation();
+ inputColumnInformation.LabelColumnName = @"id";
+ var result = mlContext.Auto().InferColumns(dataPath, inputColumnInformation);
+
+ // File has 3 columns: "id", "description" and "animal"
+ // Assert.Equal against a non-null expected value already fails on null, so no separate null check is needed.
+ Assert.Equal("id", result.ColumnInformation.LabelColumnName);
+
+ // Assert.Single verifies there is exactly one element and returns it for the value check.
+ Assert.Equal("description", Assert.Single(result.ColumnInformation.TextColumnNames));
+ Assert.Equal("animal", Assert.Single(result.ColumnInformation.CategoricalColumnNames));
+ }
}
-}
\ No newline at end of file
+}
diff --git a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj
index 1a5946558a..af86334edf 100644
--- a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj
+++ b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj
@@ -7,10 +7,14 @@
+
+
+ PreserveNewest
+
PreserveNewest
diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt
new file mode 100644
index 0000000000..afaf368195
--- /dev/null
+++ b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithNewlineBetweenQuotes.txt
@@ -0,0 +1,36 @@
+id,Column1,Column2,Column3
+1,this is a description, 1,2
+2,"this is a quote description",1,2
+3,"this is a quote description with double quote("")",1,2
+4,"this is a quote description with ""a pair of double quote""",1,2
+5,"this is a quote description with new line
+quote",1,2
+6,"this is a quote description with
+new line1 and
+new line2 and empty line
+
+and double quote""",1,2
+7, this is a description with single quote("),1,2
+// empty line between quotes
+8,"",1,2
+// single quote between quotes
+9,"""",1,2
+// simply newline between quotes
+10,"
+
+
+
+",1,2
+// simply single quote and newline between quotes
+11,"
+
+""""
+
+""
+
+""
+
+",1,2
+
+
+