diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 7ea6ab17e9..eb198427d1 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -521,6 +521,12 @@ public class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of rows to produce", ShortName = "rows", Hide = true)] public long? MaxRows; + /// + /// Character to use to escape quotes inside quoted fields. It can't be a character used as separator. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", ShortName = "escapechar")] + public char EscapeChar = Defaults.EscapeChar; + /// /// Checks that all column specifications are valid (that is, ranges are disjoint and have min<=max). /// @@ -538,6 +544,7 @@ internal static class Defaults internal const bool HasHeader = false; internal const bool TrimWhitespace = false; internal const bool ReadMultilines = false; + internal const char EscapeChar = '"'; } /// @@ -702,11 +709,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile, ch.Assert(0 <= inputSize & inputSize < SrcLim); List> lines = null; if (headerFile != null) - Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines); if (needInputSize && inputSize == 0) - Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines); else if (headerFile == null && parent.HasHeader) - Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines); + Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, parent._escapeChar, ref lines); if (needInputSize && inputSize == 0) { @@ -1063,7 +1070,8 @@ private static VersionInfo GetVersionInfo() // verWrittenCur: 0x00010009, // Introduced _flags //verWrittenCur: 0x0001000A, // Added ForceVector in Range //verWrittenCur: 0x0001000B, // Header now retained if used and present - verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags + //verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags + verWrittenCur: 0x0001000D, // Added escapeChar option verReadableCur: 0x0001000A, verWeCanReadBack: 0x00010009, loaderSignature: LoaderSignature, @@ -1090,6 +1098,7 @@ private enum OptionFlags : uint private readonly bool _useThreads; private readonly OptionFlags _flags; + private readonly char _escapeChar; private readonly long _maxRows; // Input size is zero for unknown - determined by the data (including sparse rows). private readonly int _inputSize; @@ -1210,6 +1219,10 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo } } + _escapeChar = options.EscapeChar; + if(_separators.Contains(_escapeChar)) + throw _host.ExceptUserArg(nameof(Options.EscapeChar), "EscapeChar '{0}' can't be used both as EscapeChar and separator", _escapeChar); + _bindings = new Bindings(this, cols, headerFile, dataSample); _parser = new Parser(this); } @@ -1373,6 +1386,7 @@ private TextLoader(IHost host, ModelLoadContext ctx) // int: inputSize: 0 for determined from data // int: number of separators // char[]: separators + // char: escapeChar // bindings int cbFloat = ctx.Reader.ReadInt32(); host.CheckDecode(cbFloat == sizeof(float)); @@ -1397,6 +1411,17 @@ private TextLoader(IHost host, ModelLoadContext ctx) if (_separators.Contains(':')) host.CheckDecode((_flags & OptionFlags.AllowSparse) == 0); + if (ctx.Header.ModelVerWritten >= 0x0001000D) + { + _escapeChar = ctx.Reader.ReadChar(); + } + else + { + _escapeChar = Defaults.EscapeChar; + } + + host.CheckDecode(!_separators.Contains(_escapeChar)); + _bindings = new Bindings(ctx, this); _parser = new Parser(this); } @@ -1437,6 +1462,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) // int: inputSize: 0 for determined from data // int: number of separators // char[]: separators + // char: escapeChar // bindings ctx.Writer.Write(sizeof(float)); ctx.Writer.Write(_maxRows); @@ -1445,6 +1471,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) _host.Assert(0 <= _inputSize && _inputSize < SrcLim); ctx.Writer.Write(_inputSize); ctx.Writer.WriteCharArray(_separators); + ctx.Writer.Write(_escapeChar); _bindings.Save(ctx); } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 62f5709169..5cdca75e86 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -146,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil SetupCursor(parent, active, 0, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, 1); var stats = new ParseStats(parent._host, 1); return new Cursor(parent, stats, active, reader, srcNeeded, cthd); } @@ -163,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc SetupCursor(parent, active, n, out srcNeeded, out cthd); Contracts.Assert(cthd > 0); - var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); + var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._escapeChar, parent._maxRows, cthd); var stats = new ParseStats(parent._host, cthd); if (cthd <= 1) return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; @@ -205,7 +205,7 @@ public override ValueGetter GetIdGetter() }; } - public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List> lines) + public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, char escapeChar, ref List> lines) { Contracts.AssertValue(source); Contracts.Assert(count > 0); @@ -215,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, bool readM count = 2; LineBatch batch; - var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1); + var reader = new LineReader(source, count, 1, false, readMultilines, separators, escapeChar, count, 1); try { batch = reader.GetBatch(); @@ -404,6 +404,7 @@ private sealed class LineReader private readonly bool _hasHeader; private readonly bool _readMultilines; private readonly char[] _separators; + private readonly char _escapeChar; private readonly int _batchSize; private readonly IMultiStreamSource _files; @@ -413,7 +414,7 @@ private sealed class LineReader private Task _thdRead; private volatile bool _abort; - public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref) + public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, char escapeChar, long limit, int cref) { // Note that files is allowed to be empty. Contracts.AssertValue(files); @@ -428,6 +429,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has _batchSize = batchSize; _readMultilines = readMultilines; _separators = separators; + _escapeChar = escapeChar; _files = files; _cref = cref; @@ -474,15 +476,19 @@ private class MultiLineReader private readonly char _sep0; private readonly char[] _separators; private readonly bool _sepsContainsSpace; + private readonly char _escapeChar; + private readonly bool _escapeCharIsDoubleQuote; private readonly StringBuilder _sb; private readonly TextReader _rdr; - public MultiLineReader(TextReader rdr, char[] separators) + public MultiLineReader(TextReader rdr, char[] separators, char escapeChar) { Contracts.AssertNonEmpty(separators); _sep0 = separators[0]; _separators = separators; _sepsContainsSpace = IsSep(' '); + _escapeChar = escapeChar; + _escapeCharIsDoubleQuote = (escapeChar == '"'); _sb = new StringBuilder(); _rdr = rdr; } @@ -569,6 +575,9 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim, ichCur++; } + if (ichCur >= ichLim) // if there were only leading spaces on the line + return startsInsideQuoted; + if(startsInsideQuoted || line[ichCur] == '"') { // Quoted Field Case @@ -576,45 +585,76 @@ private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim, if (!startsInsideQuoted) ichCur++; - for (; ; ichCur++) + if (_escapeCharIsDoubleQuote) { - if (ichCur >= ichLim) - // We've reached the end of the line without finding the closing quote, - // so next line will start on this quoted field - return true; - - if (line[ichCur] == '"') + for (; ; ichCur++) { - if (++ichCur >= ichLim) - // Last character in line was the closing quote of the field - return false; + if (ichCur >= ichLim) + // We've reached the end of the line without finding the closing quote, + // so next line will start on this quoted field + return true; if (line[ichCur] == '"') - // 2 Double quotes means escaped quote - continue; + { + if (++ichCur >= ichLim) + // Last character in line was the closing quote of the field + return false; - // If it wasn't an escaped quote, then this is supposed to be - // the closing quote of the field, and there should only be spaces remaining - // until the next separator. + if (line[ichCur] == '"') + // 2 Double quotes means escaped quote + continue; - if (!_sepsContainsSpace) - { - // Ignore leading spaces - while (ichCur < ichLim && line[ichCur] == ' ') - ichCur++; + // If it wasn't an escaped quote, then this is supposed to be + // the closing quote of the field + break; } + } + } + else + { + for (; ; ichCur++) + { + if (ichCur >= ichLim) + // We've reached the end of the line without finding the closing quote, + // so next line will start on this quoted field + return true; - // If there's anything else than spaces or the next separator, - // this will actually be a QuotingError on the parser, so we decide that this - // line contains a quoting error, and so it's not going to be considered a valid field - // and the rest of the line should be ignored. - if (ichCur >= ichLim || IsSep(line[ichCur])) - return false; + if (line[ichCur] == _escapeChar) + { + if (++ichCur >= ichLim) + // Last character in line was escapeChar + return true; - quotingError = true; - return false; + // Whatever char comes after an escapeChar is ignored + continue; + } + else if (line[ichCur] == '"') + { + // Since this wasn't an escaped quote, then this is supposed to be + // the closing quote of the field + break; + } } } + + // After finding the closing quote of the field... + // There should only be empty spaces until the next separator + if (!_sepsContainsSpace) + { + // Ignore leading spaces + while (ichCur < ichLim && line[ichCur] == ' ') + ichCur++; + } + + // If there's anything else than spaces or the next separator, + // this will actually be a QuotingError on the parser, so we decide that this + // line contains a quoting error, and so it's not going to be considered a valid field + // and the rest of the line should be ignored. + if (ichCur >= ichLim || IsSep(line[ichCur])) + return false; + + quotingError = true; + return false; } // Unquoted field case. @@ -655,7 +695,7 @@ private void ThreadProc() string path = _files.GetPathOrNull(ifile); using (var rdr = _files.OpenTextReader(ifile)) { - var multilineReader = new MultiLineReader(rdr, _separators); + var multilineReader = new MultiLineReader(rdr, _separators, _escapeChar); string text; long line = 0; for (; ; ) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs index 13019c4bf2..a6f8b73ba6 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs @@ -634,6 +634,7 @@ public void Clear() private readonly char[] _separators; private readonly OptionFlags _flags; + private readonly char _escapeChar; private readonly int _inputSize; private readonly ColInfo[] _infos; @@ -684,6 +685,7 @@ public Parser(TextLoader parent) _separators = parent._separators; _flags = parent._flags; + _escapeChar = parent._escapeChar; _inputSize = parent._inputSize; Contracts.Assert(_inputSize >= 0); } @@ -696,7 +698,7 @@ public static void GetInputSize(TextLoader parent, List> li minSize = int.MaxValue; maxSize = 0; var stats = new ParseStats(parent._host, cref: 1, maxShow: 0); - var impl = new HelperImpl(stats, parent._flags, parent._separators, 0, int.MaxValue); + var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._escapeChar, 0, int.MaxValue); try { foreach (var line in lines) @@ -732,7 +734,7 @@ public static void ParseSlotNames(TextLoader parent, ReadOnlyMemory textHe var sb = new StringBuilder(); var stats = new ParseStats(parent._host, cref: 1, maxShow: 0); - var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._inputSize, int.MaxValue); + var impl = new HelperImpl(stats, parent._flags, parent._separators, parent._escapeChar, parent._inputSize, int.MaxValue); try { impl.GatherFields(textHeader, textHeader.Span); @@ -848,7 +850,7 @@ public Helper CreateHelper(ParseStats stats, int srcNeeded) { Contracts.AssertValue(stats); Contracts.Assert(srcNeeded >= 0); - return new HelperImpl(stats, _flags, _separators, _inputSize, srcNeeded); + return new HelperImpl(stats, _flags, _separators, _escapeChar, _inputSize, srcNeeded); } /// @@ -867,6 +869,7 @@ private sealed class HelperImpl : Helper private readonly char _sep0; private readonly char _sep1; private readonly bool _sepContainsSpace; + private readonly char _escapeChar; private readonly int _inputSize; private readonly int _srcNeeded; private readonly bool _quoting; @@ -879,7 +882,7 @@ private sealed class HelperImpl : Helper public readonly FieldSet Fields; - public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, int inputSize, int srcNeeded) + public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, char escapeChar, int inputSize, int srcNeeded) { Contracts.AssertValue(stats); // inputSize == 0 means unknown. @@ -893,6 +896,7 @@ public HelperImpl(ParseStats stats, OptionFlags flags, char[] seps, int inputSiz _sep0 = _seps[0]; _sep1 = _seps.Length > 1 ? _seps[1] : '\0'; _sepContainsSpace = IsSep(' '); + _escapeChar = escapeChar; _inputSize = inputSize; _srcNeeded = srcNeeded; _quoting = (flags & OptionFlags.AllowQuoting) != 0; @@ -1152,29 +1156,74 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan span) ichCur++; _sb.Clear(); int ichRun = ichCur; - for (; ; ichCur++) + if (_escapeChar == '"') { - Contracts.Assert(ichCur <= ichLim); - if (ichCur >= ichLim) + for (; ; ichCur++) { - // Missing close quote! - scan.QuotingError = true; - break; - } + Contracts.Assert(ichCur <= ichLim); + if (ichCur >= ichLim) + { + // Missing close quote! + scan.QuotingError = true; + break; + } - // The logic below allow us to escape quotes (") inside quoted - // fields by using doublo quotes (""). I.e. when the loader - // encounters "" inside a quoted field, it will output only one " - // and continue parsing the rest of the field. - if (span[ichCur] == '"') + // The logic below allow us to escape double quotes (") inside quoted + // fields by using 2 double quotes (""). I.e. when the loader + // encounters "" inside a quoted field, it will output only one " + // and continue parsing the rest of the field. + if (span[ichCur] == '"') + { + if (ichCur > ichRun) + _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun)); + if (++ichCur >= ichLim) + break; + if (span[ichCur] != '"') + break; + ichRun = ichCur; + } + } + } + else + { + for (; ; ichCur++) { - if (ichCur > ichRun) - _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun)); - if (++ichCur >= ichLim) + Contracts.Assert(ichCur <= ichLim); + if (ichCur >= ichLim) + { + // Missing close quote! + scan.QuotingError = true; break; - if (span[ichCur] != '"') + } + + if (span[ichCur] == _escapeChar) + { + ichCur++; + if (ichCur >= ichLim) + { + // Missing close quote! + scan.QuotingError = true; + break; + } + + if (span[ichCur] == '"') + { + // Don't include escapeChar in span + _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun - 1)); + ichRun = ichCur; + } + + continue; + } + + if (span[ichCur] == '"') + { + if (ichCur > ichRun) + _sb.AppendSpan(span.Slice(ichRun, ichCur - ichRun)); + + ichCur++; break; - ichRun = ichCur; + } } } diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 67033afde3..84e8b329b9 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -440,6 +440,18 @@ "SortOrder": 150.0, "IsNullable": true, "Default": null + }, + { + "Name": "EscapeChar", + "Type": "Char", + "Desc": "Character to use to escape quotes inside quoted fields. It can't be a character used as separator.", + "Aliases": [ + "escapechar" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "\"" } ] }, diff --git a/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs b/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs index 5026296eab..4632ca5ed6 100644 --- a/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs +++ b/test/Microsoft.ML.Benchmarks/BenchmarkBase.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; using System.IO; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Runtime; @@ -62,4 +63,68 @@ public static string GetBenchmarkDataPathAndEnsureData(string name, string path return filePath; } } + + public class RandomFile + { + public static string CreateRandomFile(string path, int numRows, int numColumns, int maxWordLength) + { + // Create file with random strings + // to use as dataset of the benchmark + + Random random = new Random(1); + + using (StreamWriter file = new StreamWriter(path)) + { + for (int i = 0; i < numRows; i++) + file.WriteLine(CreateRandomLine(numColumns, maxWordLength, random)); + } + return path; + } + + public static string CreateRandomLine(int columns, int maxWordLength, Random random) + { + var lineSB = new System.Text.StringBuilder(); + for (int i = 0; i < columns; i++) + { + lineSB.Append(CreateRandomColumn(random.Next(100), maxWordLength, random)); + lineSB.Append(","); + } + return lineSB.ToString(); + } + + public static string CreateRandomColumn(int numwords, int maxWordLength, Random random) + { + const string characters = + "01234567890" + + "abcdefghijklmnopqrstuvwxyz" + + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + var columnSB = new System.Text.StringBuilder(); + int wordLength; + + bool quoted = false; + if (random.NextDouble() > 0.5) + { + quoted = true; + columnSB.Append('"'); + } + + for (int i = 0; i < numwords; i++) + { + wordLength = random.Next(1, maxWordLength); + for (int j = 0; j < wordLength; j++) + columnSB.Append(characters[random.Next(characters.Length)]); + + columnSB.Append(" "); + } + + if (quoted) + columnSB.Append('"'); + + if (random.Next(2) == 0) // sometimes return the column as lowercase + return columnSB.ToString().ToLower(); + + return columnSB.ToString(); + } + } } diff --git a/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs b/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs index 39aa0d6feb..46496c5100 100644 --- a/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs +++ b/test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs @@ -29,7 +29,7 @@ public void SetupData() _mlContext = new MLContext(seed: 1); var path = Path.GetTempFileName(); Console.WriteLine($"Created dataset in temporary file:\n{path}\n"); - path = CreateRandomFile(path); + path = RandomFile.CreateRandomFile(path, _numRows, _numColumns, _maxWordLength); var columns = new List(); for(int i = 0; i < _numColumns; i++) @@ -41,7 +41,8 @@ public void SetupData() { Columns = columns.ToArray(), HasHeader = false, - Separators = new char[] { ',' } + Separators = new char[] { ',' }, + AllowQuoting = true }); _dataset = textLoader.Load(path); @@ -116,56 +117,5 @@ public ITransformer TrainFeaturizeText() return model; } - - public static string CreateRandomFile(string path) - { - // Create file with random strings - // to use as dataset of the benchmark - - Random random = new Random(1); - - using (StreamWriter file = new StreamWriter(path)) - { - for(int i = 0; i < _numRows; i++) - file.WriteLine(CreateRandomLine(_numColumns, random)); - } - return path; - } - - public static string CreateRandomLine(int columns, Random random) - { - var lineSB = new System.Text.StringBuilder(); - for(int i = 0; i < columns; i++) - { - lineSB.Append(CreateRandomColumn(random, random.Next(100))); - lineSB.Append(","); - } - return lineSB.ToString(); - } - - public static string CreateRandomColumn(Random random, int numwords) - { - const string characters = - "01234567890" + - "abcdefghijklmnopqrstuvwxyz" + - "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - - var columnSB = new System.Text.StringBuilder(); - int wordLength; - - for(int i = 0; i < numwords; i++) - { - wordLength = random.Next(1, _maxWordLength); - for(int j = 0; j < wordLength; j++) - columnSB.Append(characters[random.Next(characters.Length)]); - - columnSB.Append(" "); - } - - if (random.Next(2) == 0) // sometimes return the column as lowercase - return columnSB.ToString().ToLower(); - - return columnSB.ToString(); - } } } diff --git a/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs b/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs new file mode 100644 index 0000000000..8f1faf6b76 --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/TextLoaderBench.cs @@ -0,0 +1,107 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using BenchmarkDotNet.Attributes; +using Microsoft.ML.Transforms.Text; +using Xunit; + +namespace Microsoft.ML.Benchmarks +{ + [Config(typeof(TrainConfig))] + public class TextLoaderBench : BenchmarkBase + { + private MLContext _mlContext; + private IDataView _dataView; + private static int _numColumns = 100; + private static int _numRows = 3000; + private static int _maxWordLength = 15; + private static int _numColumnsToGet = 20; + private List _columns; + + + [GlobalSetup] + public void SetupData() + { + Path.GetTempFileName(); + _mlContext = new MLContext(seed: 1); + var path = Path.GetTempFileName(); + Console.WriteLine($"Created dataset in temporary file:\n{path}\n"); + path = RandomFile.CreateRandomFile(path, _numRows, _numColumns, _maxWordLength); + + _columns = new List(); + for(int i = 0; i < _numColumns; i++) + { + _columns.Add(new TextLoader.Column($"Column{i}", DataKind.String, i)); + } + + var textLoader = _mlContext.Data.CreateTextLoader(new TextLoader.Options() + { + Columns = _columns.ToArray(), + HasHeader = false, + Separators = new char[] { ',' }, + AllowQuoting = true, + ReadMultilines = true, + EscapeChar = '\\', + }); + + _dataView = textLoader.Load(path); + } + + [Benchmark] + public void TestTextLoaderGetters() + { + using(var rowCursor = _dataView.GetRowCursorForAllColumns()) + { + var getters = new List>>(); + for (int i = 0; i < _numColumnsToGet; i++) + { + getters.Add(rowCursor.GetGetter>(_dataView.Schema[i])); + } + + ReadOnlyMemory buff = default; + while (rowCursor.MoveNext()) + { + for (int i = 0; i < _numColumnsToGet; i++) + getters[i](ref buff); + } + } + + //* Summary * + + //BenchmarkDotNet = v0.12.0, OS = Windows 10.0.18363 + //Intel Core i7 - 8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + //.NET Core SDK = 3.1.100 - preview3 - 014645 + // [Host] : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), X64 RyuJIT + // Job - XQBLAM : .NET Core 2.1.13(CoreCLR 4.6.28008.01, CoreFX 4.6.28008.01), X64 RyuJIT + + //Arguments =/ p:Configuration = Release Toolchain = netcoreapp2.1 IterationCount = 1 + //LaunchCount = 3 MaxIterationCount = 20 RunStrategy = ColdStart + //UnrollFactor = 1 WarmupCount = 1 + + //| Method | Mean | Error | StdDev | Extra Metric | + //| ---------------------- | --------: | ---------:| ---------:| -------------:| + //| TestTextLoaderGetters | 1.012 s | 0.6649 s | 0.0364 s | - | + + //// * Legends * + //Mean : Arithmetic mean of all measurements + //Error : Half of 99.9 % confidence interval + // StdDev : Standard deviation of all measurements + // Extra Metric: Value of the provided extra metric + // 1 s: 1 Second(1 sec) + + //// ***** BenchmarkRunner: End ***** + //// ** Remained 0 benchmark(s) to run ** + // Run time: 00:00:16(16.05 sec), executed benchmarks: 1 + + //Global total time: 00:00:33(33.18 sec), executed benchmarks: 1 + + return; + } + } +} diff --git a/test/Microsoft.ML.TestFramework/TestCommandBase.cs b/test/Microsoft.ML.TestFramework/TestCommandBase.cs index 6e5745d932..c1d8c941a2 100644 --- a/test/Microsoft.ML.TestFramework/TestCommandBase.cs +++ b/test/Microsoft.ML.TestFramework/TestCommandBase.cs @@ -2154,8 +2154,8 @@ public void SavePipeChooseColumnsByIndex() [Fact()] public void SavePipeTextLoaderWithMultilines() { - string dataPath = GetDataPath("multiline.csv"); - const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ col=id:Num:0 col=description:TX:1 col=animal:TX:2}"; + string dataPath = GetDataPath("multiline-escapechar.csv"); + const string loaderArgs = "loader=text{sep=, quote+ multilines+ header+ escapechar=\\ col=id:Num:0 col=description:TX:1 col=animal:TX:2}"; OutputPath modelPath = ModelPath(); string extraArgs = null; diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index b2421bacce..4768f2d82c 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -803,6 +803,43 @@ public void TestTextLoaderKeyTypeBackCompat() } } + [Fact] + public void TestTextLoaderBackCompat_VerWritt_0x0001000C() + { + // Checks backward compatibility with a text loader created with "verWrittenCur: 0x0001000C" + // Model generated with: + // loader=text{header+ col=SepalLength:Num:0 col=SepalWidth:Num:1 col=PetalLength:Num:2 col=PetalWidth:Num:2 col=Cat:TX:1-8 col=Num:9-14 col=Type:TX:4} + var mlContext = new MLContext(1); + string textLoaderModelPath = GetDataPath("backcompat/textloader_VerWritt_0x0001000C.zip"); + string irisPath = GetDataPath(TestDatasets.irisData.trainFilename); + + IDataView iris; + using (FileStream modelfs = File.OpenRead(textLoaderModelPath)) + using (var rep = RepositoryReader.Open(modelfs, mlContext)) + { + iris = ModelFileUtils.LoadLoader(mlContext, rep, new MultiFileSource(irisPath), false); + } + + var previewIris = iris.Preview(1); + var irisFirstRow = new Dictionary(); + irisFirstRow["SepalLength"] = 5.1f; + irisFirstRow["SepalWidth"] = 3.5f; + irisFirstRow["PetalLength"] = 1.4f; + irisFirstRow["PetalWidth"] = 0.2f; + + Assert.Equal(5, previewIris.ColumnView.Length); + Assert.Equal("SepalLength", previewIris.Schema[0].Name); + Assert.Equal(NumberDataViewType.Single, previewIris.Schema[0].Type); + int index = 0; + foreach (var entry in irisFirstRow) + { + Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key); + Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value); + } + Assert.Equal("Type", previewIris.RowView[0].Values[index].Key); + Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); + } + private class IrisNoFields { } @@ -939,12 +976,20 @@ public void TestLoadTextWithoutKeyTypeAttribute() } [Theory] - [InlineData(true)] - [InlineData(false)] - public void TestLoadTextWithEscapedNewLines(bool useSaved) + [InlineData(true, false)] + [InlineData(false, false)] + [InlineData(true, true)] + [InlineData(false, true)] + public void TestLoadTextWithEscapedNewLinesAndEscapeChar(bool useSaved, bool useCustomEscapeChar) { var mlContext = new MLContext(seed: 1); - var dataPath = GetDataPath("multiline.csv"); + string dataPath; + + if (!useCustomEscapeChar) + dataPath = GetDataPath("multiline.csv"); + else + dataPath = GetDataPath("multiline-escapechar.csv"); + var baselinePath = GetBaselinePath("TextLoader", "multiline.csv"); var options = new TextLoader.Options() { @@ -952,6 +997,7 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved) Separator = ",", AllowQuoting = true, ReadMultilines = true, + EscapeChar = useCustomEscapeChar ? '\\' : TextLoader.Defaults.EscapeChar, Columns = new[] { new TextLoader.Column("id", DataKind.Int32, 0), @@ -962,16 +1008,23 @@ public void TestLoadTextWithEscapedNewLines(bool useSaved) var data = mlContext.Data.LoadFromTextFile(dataPath, options); if (useSaved) - { + { // Check that loading the data view from a text file, // and then saving that data view to another text file, then loading it again // also matches the baseline. - var savedPath = DeleteOutputPath("saved-multiline.tsv"); + string savedPath; + + if (!useCustomEscapeChar) + savedPath = DeleteOutputPath("multiline-saved.tsv"); + else + savedPath = DeleteOutputPath("multiline-escapechar-saved.tsv"); + using (var fs = File.Create(savedPath)) mlContext.Data.SaveAsText(data, fs, separatorChar: '\t'); options.Separator = "\t"; + options.EscapeChar = '"'; // TextSaver always uses " as escape char data = mlContext.Data.LoadFromTextFile(savedPath, options); } diff --git a/test/data/backcompat/textloader_VerWritt_0x0001000C.zip b/test/data/backcompat/textloader_VerWritt_0x0001000C.zip new file mode 100644 index 0000000000..90ffda9458 Binary files /dev/null and b/test/data/backcompat/textloader_VerWritt_0x0001000C.zip differ diff --git a/test/data/multiline-escapechar.csv b/test/data/multiline-escapechar.csv new file mode 100644 index 0000000000..b9517a52c5 --- /dev/null +++ b/test/data/multiline-escapechar.csv @@ -0,0 +1,34 @@ +// this file should be loaded with quoting and readmultiline enabled +// and it should load without problems by the TextLoader +id,description,animal +// this is a comment that will be ignored +// this is a comment with "quotes" that will also be ignored +// this is a comment with a "quote without close quote that will also be ignored +10,this is a description,dog +11,"this is a quoted description",cat +12,"this is a multiline +quoted description", bird +13,"this has one\"doublequote which should be escaped as a single quote",dog +14,"this has \"doublequotes\" inside of it",cat +15, "this is a multiline +quoted description with + +\"doublequotes\" and + +empty new lines and + + + +escaped quotes inside of \"it\" + +//and this comment with // shouldn't be ignored +since it is part of the \"multiline\"",bird +// this line should be ignored, and the next line is empty: + +16, here is text after the empty line, dog +17, this is a line with an empty animal,"" +"", this is a line with an empty id, bird +19,"",dog +20,we also allow"quotes in the middle of fields,cat +21,or also at the end",bird +22,this is the last row description,dog \ No newline at end of file