diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
index 7ea6ab17e9..eb198427d1 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -521,6 +521,12 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of rows to produce", ShortName = "rows", Hide = true)]
public long? MaxRows;
+ /// <summary>
+ /// Character to use to escape quotes inside quoted fields. It can't be a character used as separator.
+ /// Defaults to the double quote character, in which case a quote inside a quoted field is written as two consecutive double quotes.
+ /// </summary>
+ [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")]
+ public char EscapeChar = Defaults.EscapeChar;
+
///
/// Checks that all column specifications are valid (that is, ranges are disjoint and have min<=max).
///
@@ -538,6 +544,7 @@ internal static class Defaults
internal const bool HasHeader = false;
internal const bool TrimWhitespace = false;
internal const bool ReadMultilines = false;
+ internal const char EscapeChar = '"';
}
///
@@ -702,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List> lines = null;
if (headerFile != null)
- Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
if (needInputSize && inputSize == 0)
- Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
else if (headerFile == null && parent.HasHeader)
- Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
+ Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines);
if (needInputSize && inputSize == 0)
{
@@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo()
// verWrittenCur: 0x00010009, // Introduced _flags
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
- verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+ //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
+ verWrittenCur: 0x0001000D, // Added escapeChar option
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
@@ -1090,6 +1098,7 @@ private enum OptionFlags : uint
private readonly bool _useThreads;
private readonly OptionFlags _flags;
+ private readonly char _escapeChar;
private readonly long _maxRows;
// Input size is zero for unknown - determined by the data (including sparse rows).
private readonly int _inputSize;
@@ -1210,6 +1219,10 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
}
}
+ _escapeChar = options.EscapeChar;
+ if(_separators.Contains(_escapeChar))
+ throw _host.ExceptUserArg(nameof(Options.EscapeChar), "EscapeChar '{0}' can't be used both as EscapeChar and separator", _escapeChar);
+
_bindings = new Bindings(this, cols, headerFile, dataSample);
_parser = new Parser(this);
}
@@ -1373,6 +1386,7 @@ private TextLoader(IHost host, ModelLoadContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
+ // char: escapeChar
// bindings
int cbFloat = ctx.Reader.ReadInt32();
host.CheckDecode(cbFloat == sizeof(float));
@@ -1397,6 +1411,17 @@ private TextLoader(IHost host, ModelLoadContext ctx)
if (_separators.Contains(':'))
host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0);
+ if (ctx.Header.ModelVerWritten >= 0x0001000D)
+ {
+ _escapeChar = ctx.Reader.ReadChar();
+ }
+ else
+ {
+ _escapeChar = Defaults.EscapeChar;
+ }
+
+ host.CheckDecode(!_separators.Contains(_escapeChar));
+
_bindings = new Bindings(ctx, this);
_parser = new Parser(this);
}
@@ -1437,6 +1462,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
// int: inputSize: 0 for determined from data
// int: number of separators
// char[]: separators
+ // char: escapeChar
// bindings
ctx.Writer.Write(sizeof(float));
ctx.Writer.Write(_maxRows);
@@ -1445,6 +1471,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
_host.Assert(0 <= _inputSize && _inputSize < SrcLim);
ctx.Writer.Write(_inputSize);
ctx.Writer.WriteCharArray(_separators);
+ ctx.Writer.Write(_escapeChar);
_bindings.Save(ctx);
}
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
index 62f5709169..5cdca75e86 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
@@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);
- var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
+ var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter()
};
}
- public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines)
+ public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, char escapeChar, ref List> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
@@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM
count = 2;
LineBatch batch;
- var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
+ var reader = new LineReader(source, count, 1, false, readMultilines, separators, escapeChar, count, 1);
try
{
batch = reader.GetBatch();
@@ -404,6 +404,7 @@ private sealed class LineReader
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly char[] _separators;
+ private readonly char _escapeChar;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;
@@ -413,7 +414,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;
- public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
+ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, char escapeChar, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
@@ -428,6 +429,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_batchSize = batchSize;
_readMultilines = readMultilines;
_separators = separators;
+ _escapeChar = escapeChar;
_files = files;
_cref = cref;
@@ -474,15 +476,19 @@ private class MultiLineReader
private readonly char _sep0;
private readonly char[] _separators;
private readonly bool _sepsContainsSpace;
+ private readonly char _escapeChar;
+ private readonly bool _escapeCharIsDoubleQuote;
private readonly StringBuilder _sb;
private readonly TextReader _rdr;
- public MultiLineReader(TextReader rdr, char[] separators)
+ // Creates a reader over rdr that recognizes the given field separators and uses
+ // escapeChar as the character that escapes a double quote inside a quoted field.
+ public MultiLineReader(TextReader rdr, char[] separators, char escapeChar)
{
Contracts.AssertNonEmpty(separators);
_sep0 = separators[0];
_separators = separators;
_sepsContainsSpace = IsSep(' ');
+ _escapeChar = escapeChar;
+ // Cached once: FieldIncludesNewLine uses this flag to choose between the
+ // doubled-quote scan and the custom escape-character scan.
+ _escapeCharIsDoubleQuote = (escapeChar == '"');
_sb = new StringBuilder();
_rdr = rdr;
}
@@ -569,6 +575,9 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
ichCur++;
}
+ if (ichCur >= ichLim) // if there were only leading spaces on the line
+ return startsInsideQuoted;
+
if(startsInsideQuoted || line[ichCur] == '"')
{
// Quoted Field Case
@@ -576,45 +585,76 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
if (!startsInsideQuoted)
ichCur++;
- for (; ; ichCur++)
+ if (_escapeCharIsDoubleQuote)
{
- if (ichCur >= ichLim)
- // We've reached the end of the line without finding the closing quote,
- // so next line will start on this quoted field
- return true;
-
- if (line[ichCur] == '"')
+ for (; ; ichCur++)
{
- if (++ichCur >= ichLim)
- // Last character in line was the closing quote of the field
- return false;
+ if (ichCur >= ichLim)
+ // We've reached the end of the line without finding the closing quote,
+ // so next line will start on this quoted field
+ return true;
if (line[ichCur] == '"')
- // 2 Double quotes means escaped quote
- continue;
+ {
+ if (++ichCur >= ichLim)
+ // Last character in line was the closing quote of the field
+ return false;
- // If it wasn't an escaped quote, then this is supposed to be
- // the closing quote of the field, and there should only be spaces remaining
- // until the next separator.
+ if (line[ichCur] == '"')
+ // 2 Double quotes means escaped quote
+ continue;
- if (!_sepsContainsSpace)
- {
- // Ignore leading spaces
- while (ichCur < ichLim && line[ichCur] == ' ')
- ichCur++;
+ // If it wasn't an escaped quote, then this is supposed to be
+ // the closing quote of the field
+ break;
}
+ }
+ }
+ else
+ {
+ for (; ; ichCur++)
+ {
+ if (ichCur >= ichLim)
+ // We've reached the end of the line without finding the closing quote,
+ // so next line will start on this quoted field
+ return true;
- // If there's anything else than spaces or the next separator,
- // this will actually be a QuotingError on the parser, so we decide that this
- // line contains a quoting error, and so it's not going to be considered a valid field
- // and the rest of the line should be ignored.
- if (ichCur >= ichLim || IsSep(line[ichCur]))
- return false;
+ if (line[ichCur] == _escapeChar)
+ {
+ if (++ichCur >= ichLim)
+ // Last character in line was escapeChar
+ return true;
- quotingError = true;
- return false;
+ // Whatever char comes after an escapeChar is ignored
+ continue;
+ }
+ else if (line[ichCur] == '"')
+ {
+ // Since this wasn't an escaped quote, then this is supposed to be
+ // the closing quote of the field
+ break;
+ }
}
}
+
+ // After finding the closing quote of the field...
+ // There should only be empty spaces until the next separator
+ if (!_sepsContainsSpace)
+ {
+ // Ignore leading spaces
+ while (ichCur < ichLim && line[ichCur] == ' ')
+ ichCur++;
+ }
+
+ // If there's anything else than spaces or the next separator,
+ // this will actually be a QuotingError on the parser, so we decide that this
+ // line contains a quoting error, and so it's not going to be considered a valid field
+ // and the rest of the line should be ignored.
+ if (ichCur >= ichLim || IsSep(line[ichCur]))
+ return false;
+
+ quotingError = true;
+ return false;
}
// Unquoted field case.
@@ -655,7 +695,7 @@ private void ThreadProc()
string path = _files.GetPathOrNull(ifile);
using (var rdr = _files.OpenTextReader(ifile))
{
- var multilineReader = new MultiLineReader(rdr, _separators);
+ var multilineReader = new MultiLineReader(rdr, _separators, _escapeChar);
string text;
long line = 0;
for (; ; )
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
index 13019c4bf2..a6f8b73ba6 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -634,6 +634,7 @@ public void Clear()
private readonly char[] _separators;
private readonly OptionFlags _flags;
+ private readonly char _escapeChar;
private readonly int _inputSize;
private readonly ColInfo[] _infos;
@@ -684,6 +685,7 @@ public Parser(TextLoader parent)
_separators = parent._separators;
_flags = parent._flags;
+ _escapeChar = parent._escapeChar;
_inputSize = parent._inputSize;
Contracts.Assert(_inputSize >= 0);
}
@@ -696,7 +698,7 @@ public static void GetInputSize(TextLoader parent, List> li
minSize = int.MaxValue;
maxSize = 0;
var stats = new ParseStats(parent._host, cref: 1, maxShow: 0);
- var impl = new HelperImpl(stats, parent._flags, parent._separators, 0, int.MaxValue);
+ var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._escapeChar, 0, int.MaxValue);
try
{
foreach (var line in lines)
@@ -732,7 +734,7 @@ public static void ParseSlotNames(TextLoader parent, ReadOnlyMemory textHe
var sb = new StringBuilder();
var stats = new ParseStats(parent._host, cref: 1, maxShow: 0);
- var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._inputSize, int.MaxValue);
+ var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._escapeChar, parent._inputSize, int.MaxValue);
try
{
impl.GatherFields(textHeader, textHeader.Span);
@@ -848,7 +850,7 @@ public Helper CreateHelper(ParseStats stats, int srcNeeded)
{
Contracts.AssertValue(stats);
Contracts.Assert(srcNeeded >= 0);
- return new HelperImpl(stats, _flags, _separators, _inputSize, srcNeeded);
+ return new HelperImpl(stats, _flags, _separators, _escapeChar, _inputSize, srcNeeded);
}
///
@@ -867,6 +869,7 @@ private sealed class HelperImpl : Helper
private readonly char _sep0;
private readonly char _sep1;
private readonly bool _sepContainsSpace;
+ private readonly char _escapeChar;
private readonly int _inputSize;
private readonly int _srcNeeded;
private readonly bool _quoting;
@@ -879,7 +882,7 @@ private sealed class HelperImpl : Helper
public readonly FieldSet Fields;
- public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, int inputSize, int srcNeeded)
+ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeChar, int inputSize, int srcNeeded)
{
Contracts.AssertValue(stats);
// inputSize == 0 means unknown.
@@ -893,6 +896,7 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, int inputSiz
_sep0 = _seps[0];
_sep1 = _seps.Length > 1 ? _seps[1] : '\0';
_sepContainsSpace = IsSep(' ');
+ _escapeChar = escapeChar;
_inputSize = inputSize;
_srcNeeded = srcNeeded;
_quoting = (flags & OptionFlags.AllowQuoting) != 0;
@@ -1152,29 +1156,74 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan span)
ichCur++;
_sb.Clear();
int ichRun = ichCur;
- for (; ; ichCur++)
+ if (_escapeChar == '"')
{
- Contracts.Assert(ichCur <= ichLim);
- if (ichCur >= ichLim)
+ for (; ; ichCur++)
{
- // Missing close quote!
- scan.QuotingError = true;
- break;
- }
+ Contracts.Assert(ichCur <= ichLim);
+ if (ichCur >= ichLim)
+ {
+ // Missing close quote!
+ scan.QuotingError = true;
+ break;
+ }
- // The logic below allow us to escape quotes (") inside quoted
- // fields by using doublo quotes (""). I.e. when the loader
- // encounters "" inside a quoted field, it will output only one "
- // and continue parsing the rest of the field.
- if (span[ichCur] == '"')
+ // The logic below allow us to escape double quotes (") inside quoted
+ // fields by using 2 double quotes (""). I.e. when the loader
+ // encounters "" inside a quoted field, it will output only one "
+ // and continue parsing the rest of the field.
+ if (span[ichCur] == '"')
+ {
+ if (ichCur > ichRun)
+ _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun));
+ if (++ichCur >= ichLim)
+ break;
+ if (span[ichCur] != '"')
+ break;
+ ichRun = ichCur;
+ }
+ }
+ }
+ else
+ {
+ for (; ; ichCur++)
{
- if (ichCur > ichRun)
- _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun));
- if (++ichCur >= ichLim)
+ Contracts.Assert(ichCur <= ichLim);
+ if (ichCur >= ichLim)
+ {
+ // Missing close quote!
+ scan.QuotingError = true;
break;
- if (span[ichCur] != '"')
+ }
+
+ if (span[ichCur] == _escapeChar)
+ {
+ ichCur++;
+ if (ichCur >= ichLim)
+ {
+ // Missing close quote!
+ scan.QuotingError = true;
+ break;
+ }
+
+ if (span[ichCur] == '"')
+ {
+ // Don't include escapeChar in span
+ _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun - 1));
+ ichRun = ichCur;
+ }
+
+ continue;
+ }
+
+ if (span[ichCur] == '"')
+ {
+ if (ichCur > ichRun)
+ _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun));
+
+ ichCur++;
break;
- ichRun = ichCur;
+ }
}
}
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 67033afde3..84e8b329b9 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -440,6 +440,18 @@
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
+ },
+ {
+ "Name": "EscapeChar",
+ "Type": "Char",
+ "Desc": "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.",
+ "Aliases": [
+ "escapechar"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "\""
}
]
},
diff --git a/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs b/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs
index 5026296eab..4632ca5ed6 100644
--- a/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs
+++ b/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs
@@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+using System;
using System.IO;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
@@ -62,4 +63,68 @@ public static string GetBenchmarkDataPathAndEnsureData(string name, string path
return filePath;
}
}
+
+    /// <summary>
+    /// Helpers that generate a deterministic, pseudo-random comma-separated text file
+    /// used as the input dataset of the text benchmarks.
+    /// </summary>
+    public class RandomFile
+    {
+        // Alphabet the random words are drawn from. '0' is listed twice, so it is
+        // slightly more likely than the other characters; kept as-is so the generated
+        // data does not change.
+        private const string Characters =
+            "01234567890" +
+            "abcdefghijklmnopqrstuvwxyz" +
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+        /// <summary>
+        /// Writes <paramref name="numRows"/> random lines to <paramref name="path"/>,
+        /// overwriting any existing content, and returns <paramref name="path"/>.
+        /// </summary>
+        /// <param name="path">File to write.</param>
+        /// <param name="numRows">Number of lines to write.</param>
+        /// <param name="numColumns">Number of columns in every line.</param>
+        /// <param name="maxWordLength">Exclusive upper bound for the length of each random word.</param>
+        public static string CreateRandomFile(string path, int numRows, int numColumns, int maxWordLength)
+        {
+            // Fixed seed so that every run produces exactly the same dataset.
+            var random = new Random(1);
+
+            using (var file = new StreamWriter(path))
+            {
+                for (int i = 0; i < numRows; i++)
+                    file.WriteLine(CreateRandomLine(numColumns, maxWordLength, random));
+            }
+
+            return path;
+        }
+
+        /// <summary>
+        /// Builds one line made of <paramref name="columns"/> random columns.
+        /// A ',' is appended after every column, including the last one.
+        /// </summary>
+        public static string CreateRandomLine(int columns, int maxWordLength, Random random)
+        {
+            var lineSB = new System.Text.StringBuilder();
+            for (int i = 0; i < columns; i++)
+            {
+                // Each column holds between 0 and 99 words.
+                lineSB.Append(CreateRandomColumn(random.Next(100), maxWordLength, random));
+                lineSB.Append(",");
+            }
+
+            return lineSB.ToString();
+        }
+
+        /// <summary>
+        /// Builds one column containing <paramref name="numwords"/> random words, each followed by a space.
+        /// The column is wrapped in double quotes when the first random draw is above 0.5, and it is
+        /// lower-cased when the final draw of <c>random.Next(2)</c> is 0.
+        /// </summary>
+        /// <param name="numwords">Number of words to generate.</param>
+        /// <param name="maxWordLength">Exclusive upper bound for the length of each word; word lengths start at 1.</param>
+        /// <param name="random">Random source; the order of draws is part of the generated data.</param>
+        public static string CreateRandomColumn(int numwords, int maxWordLength, Random random)
+        {
+            var columnSB = new System.Text.StringBuilder();
+
+            bool quoted = random.NextDouble() > 0.5;
+            if (quoted)
+                columnSB.Append('"');
+
+            for (int i = 0; i < numwords; i++)
+            {
+                int wordLength = random.Next(1, maxWordLength);
+                for (int j = 0; j < wordLength; j++)
+                    columnSB.Append(Characters[random.Next(Characters.Length)]);
+
+                columnSB.Append(" ");
+            }
+
+            if (quoted)
+                columnSB.Append('"');
+
+            // Sometimes return the column as lowercase. The invariant culture is used so the
+            // generated dataset is identical on every machine (e.g. under tr-TR the
+            // culture-sensitive ToLower() maps 'I' to a dotless 'ı').
+            if (random.Next(2) == 0)
+                return columnSB.ToString().ToLowerInvariant();
+
+            return columnSB.ToString();
+        }
+    }
}
diff --git a/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs b/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs
index 39aa0d6feb..46496c5100 100644
--- a/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs
+++ b/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs
@@ -29,7 +29,7 @@ public void SetupData()
_mlContext = new MLContext(seed: 1);
var path = Path.GetTempFileName();
Console.WriteLine($"Created dataset in temporary file:\n{path}\n");
- path = CreateRandomFile(path);
+ path = RandomFile.CreateRandomFile(path, _numRows, _numColumns, _maxWordLength);
var columns = new List();
for(int i = 0; i < _numColumns; i++)
@@ -41,7 +41,8 @@ public void SetupData()
{
Columns = columns.ToArray(),
HasHeader = false,
- Separators = new char[] { ',' }
+ Separators = new char[] { ',' },
+ AllowQuoting = true
});
_dataset = textLoader.Load(path);
@@ -116,56 +117,5 @@ public ITransformer TrainFeaturizeText()
return model;
}
-
- public static string CreateRandomFile(string path)
- {
- // Create file with random strings
- // to use as dataset of the benchmark
-
- Random random = new Random(1);
-
- using (StreamWriter file = new StreamWriter(path))
- {
- for(int i = 0; i < _numRows; i++)
- file.WriteLine(CreateRandomLine(_numColumns, random));
- }
- return path;
- }
-
- public static string CreateRandomLine(int columns, Random random)
- {
- var lineSB = new System.Text.StringBuilder();
- for(int i = 0; i < columns; i++)
- {
- lineSB.Append(CreateRandomColumn(random, random.Next(100)));
- lineSB.Append(",");
- }
- return lineSB.ToString();
- }
-
- public static string CreateRandomColumn(Random random, int numwords)
- {
- const string characters =
- "01234567890" +
- "abcdefghijklmnopqrstuvwxyz" +
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
-
- var columnSB = new System.Text.StringBuilder();
- int wordLength;
-
- for(int i = 0; i < numwords; i++)
- {
- wordLength = random.Next(1, _maxWordLength);
- for(int j = 0; j < wordLength; j++)
- columnSB.Append(characters[random.Next(characters.Length)]);
-
- columnSB.Append(" ");
- }
-
- if (random.Next(2) == 0) // sometimes return the column as lowercase
- return columnSB.ToString().ToLower();
-
- return columnSB.ToString();
- }
}
}
diff --git a/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs b/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs
new file mode 100644
index 0000000000..8f1faf6b76
--- /dev/null
+++ b/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs
@@ -0,0 +1,107 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.IO;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+using BenchmarkDotNet.Attributes;
+using Microsoft.ML.Transforms.Text;
+using Xunit;
+
+namespace Microsoft.ML.Benchmarks
+{
+    /// <summary>
+    /// Benchmarks reading string columns through a <see cref="TextLoader"/> configured with
+    /// quoting, multiline fields and a custom escape character.
+    /// </summary>
+    [Config(typeof(TrainConfig))]
+    public class TextLoaderBench : BenchmarkBase
+    {
+        private MLContext _mlContext;
+        private IDataView _dataView;
+        private static int _numColumns = 100;
+        private static int _numRows = 3000;
+        private static int _maxWordLength = 15;
+        private static int _numColumnsToGet = 20;
+        private List<TextLoader.Column> _columns;
+
+        /// <summary>
+        /// Writes the random dataset to a temporary file and creates the data view the benchmark reads from.
+        /// </summary>
+        [GlobalSetup]
+        public void SetupData()
+        {
+            _mlContext = new MLContext(seed: 1);
+
+            // GetTempFileName creates the file on disk, so it is called exactly once;
+            // the file is then overwritten with the generated data.
+            var path = Path.GetTempFileName();
+            Console.WriteLine($"Created dataset in temporary file:\n{path}\n");
+            path = RandomFile.CreateRandomFile(path, _numRows, _numColumns, _maxWordLength);
+
+            _columns = new List<TextLoader.Column>();
+            for (int i = 0; i < _numColumns; i++)
+            {
+                _columns.Add(new TextLoader.Column($"Column{i}", DataKind.String, i));
+            }
+
+            var textLoader = _mlContext.Data.CreateTextLoader(new TextLoader.Options()
+            {
+                Columns = _columns.ToArray(),
+                HasHeader = false,
+                Separators = new char[] { ',' },
+                AllowQuoting = true,
+                ReadMultilines = true,
+                EscapeChar = '\\',
+            });
+
+            _dataView = textLoader.Load(path);
+        }
+
+        /// <summary>
+        /// Iterates over every row and invokes the getters of the first <c>_numColumnsToGet</c> columns.
+        /// </summary>
+        [Benchmark]
+        public void TestTextLoaderGetters()
+        {
+            using (var rowCursor = _dataView.GetRowCursorForAllColumns())
+            {
+                var getters = new List<ValueGetter<ReadOnlyMemory<char>>>();
+                for (int i = 0; i < _numColumnsToGet; i++)
+                {
+                    getters.Add(rowCursor.GetGetter<ReadOnlyMemory<char>>(_dataView.Schema[i]));
+                }
+
+                ReadOnlyMemory<char> buff = default;
+                while (rowCursor.MoveNext())
+                {
+                    for (int i = 0; i < _numColumnsToGet; i++)
+                        getters[i](ref buff);
+                }
+            }
+
+            //* Summary *
+
+            //BenchmarkDotNet = v0.12.0, OS = Windows 10.0.18363
+            //Intel Core i7 - 8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+            //.NET Core SDK = 3.1.100 - preview3 - 014645
+            //  [Host] : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), X64 RyuJIT
+            //  Job - XQBLAM : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), X64 RyuJIT
+
+            //Arguments =/ p:Configuration = Release Toolchain = netcoreapp2.1 IterationCount = 1
+            //LaunchCount = 3 MaxIterationCount = 20 RunStrategy = ColdStart
+            //UnrollFactor = 1 WarmupCount = 1
+
+            //| Method | Mean | Error | StdDev | Extra Metric |
+            //| ---------------------- | --------: | ---------:| ---------:| -------------:|
+            //| TestTextLoaderGetters | 1.012 s | 0.6649 s | 0.0364 s | - |
+
+            //// * Legends *
+            //Mean : Arithmetic mean of all measurements
+            //Error : Half of 99.9 % confidence interval
+            //  StdDev : Standard deviation of all measurements
+            //  Extra Metric: Value of the provided extra metric
+            //  1 s: 1 Second(1 sec)
+
+            //// ***** BenchmarkRunner: End *****
+            //// ** Remained 0 benchmark(s) to run **
+            //  Run time: 00:00:16(16.05 sec), executed benchmarks: 1
+
+            //Global total time: 00:00:33(33.18 sec), executed benchmarks: 1
+        }
+    }
+}
diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs
index 6e5745d932..c1d8c941a2 100644
--- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs
+++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs
@@ -2154,8 +2154,8 @@ public void SavePipeChooseColumnsByIndex()
[Fact()]
public void SavePipeTextLoaderWithMultilines()
{
- string dataPath = GetDataPath("multiline.csv");
- const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ col=id:Num:0 col=description:TX:1 col=animal:TX:2}";
+ string dataPath = GetDataPath("multiline-escapechar.csv");
+ const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ escapechar=\\ col=id:Num:0 col=description:TX:1 col=animal:TX:2}";
OutputPath modelPath = ModelPath();
string extraArgs = null;
diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs
index b2421bacce..4768f2d82c 100644
--- a/test/Microsoft.ML.Tests/TextLoaderTests.cs
+++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs
@@ -803,6 +803,43 @@ public void TestTextLoaderKeyTypeBackCompat()
}
}
+        [Fact]
+        public void TestTextLoaderBackCompat_VerWritt_0x0001000C()
+        {
+            // Checks backward compatibility with a text loader created with "verWrittenCur: 0x0001000C",
+            // i.e. a model saved before the escapeChar option was added to the loader.
+            // Model generated with:
+            // loader=text{header+ col=SepalLength:Num:0 col=SepalWidth:Num:1 col=PetalLength:Num:2 col=PetalWidth:Num:2 col=Cat:TX:1-8 col=Num:9-14 col=Type:TX:4}
+            // NOTE(review): the command above lists seven columns while this test asserts five;
+            // confirm the exact command that produced the checked-in model.
+            var mlContext = new MLContext(1);
+            string textLoaderModelPath = GetDataPath("backcompat/textloader_VerWritt_0x0001000C.zip");
+            string irisPath = GetDataPath(TestDatasets.irisData.trainFilename);
+
+            IDataView iris;
+            using (FileStream modelfs = File.OpenRead(textLoaderModelPath))
+            using (var rep = RepositoryReader.Open(modelfs, mlContext))
+            {
+                iris = ModelFileUtils.LoadLoader(mlContext, rep, new MultiFileSource(irisPath), false);
+            }
+
+            var previewIris = iris.Preview(1);
+
+            // Expected leading values of the first row, in column order. An array is used instead of a
+            // Dictionary because the enumeration order of a Dictionary is not guaranteed.
+            var irisFirstRow = new[]
+            {
+                new KeyValuePair<string, float>("SepalLength", 5.1f),
+                new KeyValuePair<string, float>("SepalWidth", 3.5f),
+                new KeyValuePair<string, float>("PetalLength", 1.4f),
+                new KeyValuePair<string, float>("PetalWidth", 0.2f),
+            };
+
+            Assert.Equal(5, previewIris.ColumnView.Length);
+            Assert.Equal("SepalLength", previewIris.Schema[0].Name);
+            Assert.Equal(NumberDataViewType.Single, previewIris.Schema[0].Type);
+
+            int index = 0;
+            foreach (var entry in irisFirstRow)
+            {
+                Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key);
+                Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value);
+            }
+
+            // The column right after the four numeric ones is the text label.
+            Assert.Equal("Type", previewIris.RowView[0].Values[index].Key);
+            Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString());
+        }
+
private class IrisNoFields
{
}
@@ -939,12 +976,20 @@ public void TestLoadTextWithoutKeyTypeAttribute()
}
[Theory]
- [InlineData(true)]
- [InlineData(false)]
- public void TestLoadTextWithEscapedNewLines(bool useSaved)
+ [InlineData(true, false)]
+ [InlineData(false, false)]
+ [InlineData(true, true)]
+ [InlineData(false, true)]
+ public void TestLoadTextWithEscapedNewLinesAndEscapeChar(bool useSaved, bool useCustomEscapeChar)
{
var mlContext = new MLContext(seed: 1);
- var dataPath = GetDataPath("multiline.csv");
+ string dataPath;
+
+ if (!useCustomEscapeChar)
+ dataPath = GetDataPath("multiline.csv");
+ else
+ dataPath = GetDataPath("multiline-escapechar.csv");
+
var baselinePath = GetBaselinePath("TextLoader", "multiline.csv");
var options = new TextLoader.Options()
{
@@ -952,6 +997,7 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved)
Separator = ",",
AllowQuoting = true,
ReadMultilines = true,
+ EscapeChar = useCustomEscapeChar ? '\\' : TextLoader.Defaults.EscapeChar,
Columns = new[]
{
new TextLoader.Column("id", DataKind.Int32, 0),
@@ -962,16 +1008,23 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved)
var data = mlContext.Data.LoadFromTextFile(dataPath, options);
if (useSaved)
- {
+ {
// Check that loading the data view from a text file,
// and then saving that data view to another text file, then loading it again
// also matches the baseline.
- var savedPath = DeleteOutputPath("saved-multiline.tsv");
+ string savedPath;
+
+ if (!useCustomEscapeChar)
+ savedPath = DeleteOutputPath("multiline-saved.tsv");
+ else
+ savedPath = DeleteOutputPath("multiline-escapechar-saved.tsv");
+
using (var fs = File.Create(savedPath))
mlContext.Data.SaveAsText(data, fs, separatorChar: '\t');
options.Separator = "\t";
+ options.EscapeChar = '"'; // TextSaver always uses " as escape char
data = mlContext.Data.LoadFromTextFile(savedPath, options);
}
diff --git a/test/data/backcompat/textloader_VerWritt_0x0001000C.zip b/test/data/backcompat/textloader_VerWritt_0x0001000C.zip
new file mode 100644
index 0000000000..90ffda9458
Binary files /dev/null and b/test/data/backcompat/textloader_VerWritt_0x0001000C.zip differ
diff --git a/test/data/multiline-escapechar.csv b/test/data/multiline-escapechar.csv
new file mode 100644
index 0000000000..b9517a52c5
--- /dev/null
+++ b/test/data/multiline-escapechar.csv
@@ -0,0 +1,34 @@
+// this file should be loaded with quoting and readmultiline enabled
+// and it should load without problems by the TextLoader
+id,description,animal
+// this is a comment that will be ignored
+// this is a comment with "quotes" that will also be ignored
+// this is a comment with a "quote without close quote that will also be ignored
+10,this is a description,dog
+11,"this is a quoted description",cat
+12,"this is a multiline
+quoted description", bird
+13,"this has one\"doublequote which should be escaped as a single quote",dog
+14,"this has \"doublequotes\" inside of it",cat
+15, "this is a multiline
+quoted description with
+
+\"doublequotes\" and
+
+empty new lines and
+
+
+
+escaped quotes inside of \"it\"
+
+//and this comment with // shouldn't be ignored
+since it is part of the \"multiline\"",bird
+// this line should be ignored, and the next line is empty:
+
+16, here is text after the empty line, dog
+17, this is a line with an empty animal,""
+"", this is a line with an empty id, bird
+19,"",dog
+20,we also allow"quotes in the middle of fields,cat
+21,or also at the end",bird
+22,this is the last row description,dog
\ No newline at end of file