From e03b0e4a73094c576f8d5ae8b72065a4d014290a Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 17 Dec 2019 15:52:22 -0800 Subject: [PATCH 1/3] fix textloader bug on quotes --- .../DataLoadSave/Text/TextLoaderCursor.cs | 4 +-- .../Utilities/StreamUtils.cs | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs index 4394b26121..76e310bef6 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs @@ -487,7 +487,7 @@ private void ThreadProc() // REVIEW: Avoid allocating a string for every line. This would probably require // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) // and implementing all the necessary conversion functionality on it. See task 3871. - text = rdr.ReadLine(); + text = rdr.ReadEntry(); if (text == null) goto LNext; line++; @@ -514,7 +514,7 @@ private void ThreadProc() if (_abort) return; - text = rdr.ReadLine(); + text = rdr.ReadEntry(); if (text == null) { // We're done with this file. Queue the last partial batch. diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index f157d09ea8..eba6559938 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -174,5 +174,40 @@ private static string[] Expand(string pattern) return matchList.ToArray(); } #endif + + public static string ReadEntry(this TextReader sr) + { + string strReturn = string.Empty; + + // get first bit + strReturn += sr.ReadLine(); + + // And get more lines until the number of quotes is even + while (strReturn.GetNumberOf("\"").IsOdd()) + { + string strNow = sr.ReadLine(); + strReturn += strNow; + } + + // Then return what we've gotten + if (strReturn == string.Empty) + { + return null; + } + else + { + return strReturn; + } + } + + public static int GetNumberOf(this string s, string strSearchString) + { + return s.Length - s.Replace(strSearchString, string.Empty).Length; + } + + public static bool IsOdd(this int i) + { + return i % 2 != 0; + } } } From 3ddb3b08f8e26accadebb3ef59e2e8eebe6959fc Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 19 Dec 2019 10:30:17 -0800 Subject: [PATCH 2/3] use static method instead of extension method --- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index eba6559938..bbc74f6844 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -183,7 +183,7 @@ public static string ReadEntry(this TextReader sr) strReturn += sr.ReadLine(); // And get more lines until the number of quotes is even - while (strReturn.GetNumberOf("\"").IsOdd()) + while (GetNumberOf(strReturn, "\"") % 2 != 0 ) { string strNow = sr.ReadLine(); strReturn += strNow; @@ -200,14 +200,13 @@ public static string ReadEntry(this TextReader sr) } } - public static int GetNumberOf(this string s, string strSearchString) + public static int GetNumberOf(string s, string strSearchString) { - return s.Length - s.Replace(strSearchString, string.Empty).Length; - } - - public static bool IsOdd(this int i) - { - return i % 2 != 0; + if(strSearchString.Length == 0 || s.Length == 0) + { + return 0; + } + return (s.Length - s.Replace(strSearchString, string.Empty).Length) / strSearchString.Length; } } } From 9b20ead4edbcd4e0a2950c0dcde6ae00f2b6ccf3 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 19 Dec 2019 10:32:10 -0800 Subject: [PATCH 3/3] better name --- src/Microsoft.ML.Data/Utilities/StreamUtils.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs index bbc74f6844..32ee6c774f 100644 --- a/src/Microsoft.ML.Data/Utilities/StreamUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/StreamUtils.cs @@ -177,26 +177,26 @@ private static string[] Expand(string pattern) public static string ReadEntry(this TextReader sr) { - string strReturn = string.Empty; + string entry = string.Empty; // get first bit - strReturn += sr.ReadLine(); + entry += sr.ReadLine(); // And get more lines until the number of quotes is even - while (GetNumberOf(strReturn, "\"") % 2 != 0 ) + while (GetNumberOf(entry, "\"") % 2 != 0 ) { - string strNow = sr.ReadLine(); - strReturn += strNow; + string line = sr.ReadLine(); + entry += line; } // Then return what we've gotten - if (strReturn == string.Empty) + if (entry == string.Empty) { return null; } else { - return strReturn; + return entry; } }