From 15df6ce23b8e6e6938b562dfc82a9fd106f14f2a Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Fri, 15 Nov 2019 14:54:29 -0800 Subject: [PATCH 01/17] Add core hashing functions --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 173 ++++++++++++++ src/Microsoft.ML.Data/Transforms/Hashing.cs | 241 +++++++++++++++++++- 2 files changed, 408 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 4438293317..84e2f2648f 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -105,6 +105,167 @@ public static uint MurmurRound(uint hash, uint chunk) return hash; } + private static unsafe uint MurmurRoundV2(uint hash, byte* key, int len) + { + int nblocks = len / 4; + byte* data = key; + uint* blocks = (uint*)(data + nblocks * 4); + + for (int i = -nblocks; i!=0; i++) + { + uint chunk = blocks[i]; + chunk *= 0xCC9E2D51; + chunk = Rotate(chunk, 15); + chunk *= 0x1B873593; + + hash ^= chunk; + hash = Rotate(hash, 13); + hash *= 5; + hash += 0xE6546B64; + } + + byte* tail = (byte*)(data + nblocks * 4); + + uint k1 = 0; + + switch (len & 3) + { + case 3: + k1 ^= (uint)tail[2] << 16; + goto case 2; + case 2: + k1 ^= (uint)tail[1] << 8; + goto case 1; + case 1: + k1 ^= tail[0]; + k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15); + k1 *= 0x1B873593; + hash ^= k1; + break; + } + + return hash; + } + + public static uint MurmurRoundFloat(uint hash, float chunk) + { + unsafe + { + float* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 4); + } + } + + public static uint MurmurRoundDouble(uint hash, double chunk) + { + unsafe + { + double* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 4); + } + } + + public static uint MurmurRoundText(uint hash, string chunk) + { + unsafe + { + byte[] utf16Bytes = Encoding.Unicode.GetBytes(chunk); + byte[] utf8Bytes = Encoding.Convert(Encoding.Unicode, Encoding.UTF8, utf16Bytes); + fixed (byte* key = utf8Bytes) + return MurmurRoundV2(hash, key, chunk.Length); + } + } + + public static uint MurmurRoundU1(uint hash, byte chunk) + { + unsafe + { + byte* key = &chunk; + return MurmurRoundV2(hash, key, 1); + } + } + + public static uint MurmurRoundU2(uint hash, ushort chunk) + { + unsafe + { + ushort* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 2); + } + } + + public static uint MurmurRoundU4(uint hash, uint chunk) + { + unsafe + { + uint* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 4); + } + } + + public static uint MurmurRoundU8(uint hash, ulong chunk) + { + unsafe + { + ulong* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 8); + } + } + + public static uint MurmurRoundI1(uint hash, sbyte chunk) + { + unsafe + { + sbyte* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 1); + } + } + + public static uint MurmurRoundI2(uint hash, short chunk) + { + unsafe + { + short* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 1); + } + } + + public static uint MurmurRoundI4(uint hash, int chunk) + { + unsafe + { + int* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 1); + } + } + + public static uint MurmurRoundI8(uint hash, long chunk) + { + unsafe + { + long* keys = &chunk; + byte* key; + key = (byte*)keys; + return MurmurRoundV2(hash, key, 1); + } + } + /// /// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding. /// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th. @@ -284,6 +445,18 @@ public static uint MixHash(uint hash) return hash; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint MixHashV2(uint hash, int len) + { + hash ^= (uint)len; + hash ^= hash >> 16; + hash *= 0x85ebca6b; + hash ^= hash >> 13; + hash *= 0xc2b2ae35; + hash ^= hash >> 16; + return hash; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint Rotate(uint x, int r) { diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 6919f865ae..a186ac8496 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -11,6 +11,7 @@ using Microsoft.ML.CommandLine; using Microsoft.ML.Data; using Microsoft.ML.Internal.Utilities; +using Microsoft.ML.Model.OnnxConverter; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; @@ -122,8 +123,9 @@ private static VersionInfo GetVersionInfo() return new VersionInfo( modelSignature: "HASHTRNS", // verWrittenCur: 0x00010001, // Initial - verWrittenCur: 0x00010002, // Invert hash key values, hash fix - verReadableCur: 0x00010002, + //verWrittenCur: 0x00010002, // Invert hash key values, hash fix + verWrittenCur: 0x00010003, + verReadableCur: 0x00010003, verWeCanReadBack: 0x00010002, loaderSignature: LoaderSignature, loaderAssemblyName: typeof(HashingTransformer).Assembly.FullName); @@ -245,9 +247,15 @@ private Delegate GetGetterCore(DataViewRow input, int iinfo, out Action disposer disposer = null; input.Schema.TryGetColumnIndex(_columns[iinfo].InputColumnName, out int srcCol); var srcType = input.Schema[srcCol].Type; - if (!(srcType is VectorDataViewType vectorType)) - return ComposeGetterOne(input, iinfo, srcCol, srcType); - return ComposeGetterVec(input, iinfo, srcCol, vectorType); + if (GetVersionInfo().VerWrittenCur == 0x00010002) + { + if (!(srcType is VectorDataViewType vectorType)) + return ComposeGetterOne(input, iinfo, srcCol, srcType); + return ComposeGetterVec(input, iinfo, srcCol, vectorType); + } + if (!(srcType is VectorDataViewType vectorType2)) + return ComposeGetterOneV2(input, iinfo, srcCol, srcType); + return ComposeGetterVec(input, iinfo, srcCol, vectorType2); } private protected override IRowMapper MakeRowMapper(DataViewSchema schema) => new Mapper(this, schema); @@ -378,6 +386,58 @@ private ValueGetter ComposeGetterOne(DataViewRow input, int iinfo, int src return MakeScalarHashGetter(input, srcCol, seed, mask); } + private ValueGetter ComposeGetterOneV2(DataViewRow input, int iinfo, int srcCol, DataViewType srcType) + { + Host.Assert(HashingEstimator.IsColumnTypeValid(srcType)); + + var mask = (1U << _columns[iinfo].NumberOfBits) - 1; + uint seed = _columns[iinfo].Seed; + // In case of single valued input column, hash in 0 for the slot index. + if (_columns[iinfo].UseOrderedHashing) + seed = Hashing.MurmurRound(seed, 0); + + if (srcType is KeyDataViewType) + { + if (srcType.RawType == typeof(uint)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(ulong)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(ushort)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + + Host.Assert(srcType.RawType == typeof(byte)); + return MakeScalarHashGetter(input, srcCol, seed, mask); + } + + if (srcType.RawType == typeof(ReadOnlyMemory)) + return MakeScalarHashGetter, HashTextV2>(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(float)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(double)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(sbyte)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(short)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(int)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(long)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(byte)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(ushort)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(uint)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(ulong)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + else if (srcType.RawType == typeof(DataViewRowId)) + return MakeScalarHashGetter(input, srcCol, seed, mask); + + Host.Assert(srcType.RawType == typeof(bool)); + return MakeScalarHashGetter(input, srcCol, seed, mask); + } + private ValueGetter> ComposeGetterVec(DataViewRow input, int iinfo, int srcCol, VectorDataViewType srcType) { Host.Assert(HashingEstimator.IsColumnTypeValid(srcType.ItemType)); @@ -472,6 +532,13 @@ public uint HashCore(uint seed, uint mask, in float value) => float.IsNaN(value) ? 0 : (Hashing.MixHash(Hashing.MurmurRound(seed, FloatUtils.GetBits(value == 0 ? 0 : value))) & mask) + 1; } + private readonly struct HashFloatV2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in float value) + => float.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundFloat(seed, value), 4)); + } + private readonly struct HashDouble : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -490,6 +557,13 @@ public uint HashCore(uint seed, uint mask, in double value) } } + private readonly struct HashDoubleV2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in double value) + => double.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundDouble(seed, value), 8)); + } + private readonly struct HashText : IHasher> { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -497,6 +571,17 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) => value.IsEmpty ? 0 : (Hashing.MurmurHash(seed, value.Span.Trim(' ')) & mask) + 1; } + private readonly struct HashTextV2 : IHasher> + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) + { + var len = value.Length; + string valueString = value.ToString(); + return Hashing.MixHashV2(Hashing.MurmurRoundText(seed, valueString), len); + } + } + private readonly struct HashKey1 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -504,6 +589,13 @@ public uint HashCore(uint seed, uint mask, in byte value) => value == 0 ? 0 : (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashKey1V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in byte value) + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value), 1) & mask) + 1; + } + private readonly struct HashKey2 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -511,6 +603,13 @@ public uint HashCore(uint seed, uint mask, in ushort value) => value == 0 ? 0 : (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashKey2V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in ushort value) + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU2(seed, value), 2) & mask) + 1; + } + private readonly struct HashKey4 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -518,6 +617,13 @@ public uint HashCore(uint seed, uint mask, in uint value) => value == 0 ? 0 : (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashKey4V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in uint value) + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU4(seed, value), 4) & mask) + 1; + } + private readonly struct HashKey8 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -533,6 +639,13 @@ public uint HashCore(uint seed, uint mask, in ulong value) } } + private readonly struct HashKey8V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in ulong value) + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU8(seed, value), 8) & mask) + 1; + } + private readonly struct HashU1 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -540,6 +653,13 @@ public uint HashCore(uint seed, uint mask, in byte value) => (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashU1V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in byte value) + => (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value), 1) & mask); + } + private readonly struct HashU2 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -547,6 +667,13 @@ public uint HashCore(uint seed, uint mask, in ushort value) => (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashU2V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in ushort value) + => (Hashing.MixHashV2(Hashing.MurmurRoundU2(seed, value), 2) & mask); + } + private readonly struct HashU4 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -554,6 +681,13 @@ public uint HashCore(uint seed, uint mask, in uint value) => (Hashing.MixHash(Hashing.MurmurRound(seed, value)) & mask) + 1; } + private readonly struct HashU4V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in uint value) + => (Hashing.MixHashV2(Hashing.MurmurRoundU4(seed, value), 4)); //&mask + } + private readonly struct HashU8 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -567,6 +701,13 @@ public uint HashCore(uint seed, uint mask, in ulong value) } } + private readonly struct HashU8V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in ulong value) + => (Hashing.MixHashV2(Hashing.MurmurRoundU8(seed, value), 8) & mask); //&mask + } + private readonly struct HashU16 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -594,6 +735,13 @@ public uint HashCore(uint seed, uint mask, in bool value) => (Hashing.MixHash(Hashing.MurmurRound(seed, value ? 1u : 0u)) & mask) + 1; } + private readonly struct HashBoolV2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in bool value) + => (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value ? (byte)1 : (byte)0), 1) & mask); + } + private readonly struct HashI1 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -601,6 +749,13 @@ public uint HashCore(uint seed, uint mask, in sbyte value) => (Hashing.MixHash(Hashing.MurmurRound(seed, (uint)value)) & mask) + 1; } + private readonly struct HashI1V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in sbyte value) + => (Hashing.MixHashV2(Hashing.MurmurRoundI1(seed, value), 1) & mask); + } + private readonly struct HashI2 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -608,6 +763,13 @@ public uint HashCore(uint seed, uint mask, in short value) => (Hashing.MixHash(Hashing.MurmurRound(seed, (uint)value)) & mask) + 1; } + private readonly struct HashI2V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in short value) + => (Hashing.MixHashV2(Hashing.MurmurRoundI2(seed, value), 2) & mask); + } + private readonly struct HashI4 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -615,6 +777,13 @@ public uint HashCore(uint seed, uint mask, in int value) => (Hashing.MixHash(Hashing.MurmurRound(seed, (uint)value)) & mask) + 1; } + private readonly struct HashI4V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in int value) + => (Hashing.MixHashV2(Hashing.MurmurRoundI4(seed, value), 4) & mask); + } + private readonly struct HashI8 : IHasher { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -628,6 +797,13 @@ public uint HashCore(uint seed, uint mask, in long value) } } + private readonly struct HashI8V2 : IHasher + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public uint HashCore(uint seed, uint mask, in long value) + => (Hashing.MixHashV2(Hashing.MurmurRoundI8(seed, value), 8) & mask); + } + private static ValueGetter MakeScalarHashGetter(DataViewRow input, int srcCol, uint seed, uint mask) where THash : struct, IHasher { @@ -780,7 +956,7 @@ private static ValueGetter> MakeVectorOrderedHashGetter( }; } - private sealed class Mapper : OneToOneMapperBase + private sealed class Mapper : OneToOneMapperBase, ISaveAsOnnx { private sealed class ColInfo { @@ -834,6 +1010,59 @@ private void AddMetaKeyValues(int i, DataViewSchema.Annotations.Builder builder) } protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func activeOutput, out Action disposer) => _parent.GetGetterCore(input, iinfo, out disposer); + + private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariable, string dstVariable) + { + string opType; +/* + opType = "Cast"; + string castOutput = ctx.AddIntermediateVariable(_types[iinfo], "CastOutput", true); + var castNode = ctx.CreateNode(opType, srcVariable, castOutput, ctx.GetNodeName(opType), ""); + var t = _types[iinfo].RawType;//OutputKind.ToInternalDataKind().ToType(); + castNode.AddAttribute("to", t);*/ + + opType = "MurmurHash3"; + string murmurOutput = ctx.AddIntermediateVariable(_types[iinfo], "MurmurOutput", true); + var murmurNode = ctx.CreateNode(opType, srcVariable, dstVariable, ctx.GetNodeName(opType), "com.microsoft"); + murmurNode.AddAttribute("positive", 1); + var seed = _parent._columns[iinfo].Seed; + murmurNode.AddAttribute("seed", seed); + + /* + opType = "And"; + var mask = (1U << _parent._columns[iinfo].NumberOfBits) - 1; + string m = ctx.AddInitializer(mask); + string andOutput = ctx.AddIntermediateVariable(_types[iinfo], "andOutput", true); + var andNode = ctx.CreateNode(opType, new[] { murmurOutput, m}, new[] { andOutput}, ctx.GetNodeName(opType), ""); + + opType = "Add"; + string one = ctx.AddInitializer(1); + var addNode = ctx.CreateNode(opType, new[] { andOutput, one }, new[] { dstVariable }, ctx.GetNodeName(opType), "");*/ + + return true; + } + + void ISaveAsOnnx.SaveAsOnnx(OnnxContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + for (int iinfo = 0; iinfo < _parent._columns.Length; ++iinfo) + { + var colName = _parent._columns[iinfo].Name; + string inputColumnName = InputSchema[colName].Name; + if (!ctx.ContainsColumn(inputColumnName)) + { + ctx.RemoveColumn(inputColumnName, false); + continue; + } + + if (!SaveAsOnnxCore(ctx, iinfo, ctx.GetVariableName(inputColumnName), ctx.AddIntermediateVariable(_types[iinfo], inputColumnName))) + { + ctx.RemoveColumn(inputColumnName, true); + } + } + } + + bool ICanSaveOnnx.CanSaveOnnx(OnnxContext ctx) => true; } private abstract class InvertHashHelper From 8a2992738166e42a61b7e53d3f01b86585d42279 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Fri, 15 Nov 2019 14:54:57 -0800 Subject: [PATCH 02/17] add test --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 5e08a64fb2..93bd652319 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -997,6 +997,56 @@ from weighting in weightingCriteria Done(); } + private class HashData + { + public ReadOnlyMemory Education { get; set; } + } + + [Fact] + public void MurmurHashTest() + { + var mlContext = new MLContext(); + + var samples = new[] + { + new HashData {Education = "alibaba".AsMemory()}, + //new DataP {Education = "0-5yrs"}, + //new DataP {Education = "6-11yrs"}, + //new DataP {Education = "6-11yrs"}, + //new DataP {Education = "11-15yrs"} + }; + + IDataView data = mlContext.Data.LoadFromEnumerable(samples); + //ta.GetRowCursor.Schema[0].Type = KeyDataViewType; + + var hashEstimator = new HashingEstimator(Env, "Education"); + //var modelPath = "MurmurHashModel.zip"; + var model = hashEstimator.Fit(data); + //mlContext.Model.Save(model, data.Schema, modelPath); + var hashTransformedData = model.Transform(data); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + var onnxFileName = "MurmurHashV2.onnx"; + var onnxTextName = "MurmurHashV2.txt"; + var onnxModelPath = GetOutputPath(onnxFileName); + var onnxTextPath = GetOutputPath(onnxTextName); + + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedScalarColumns("Education", outputNames[0], hashTransformedData, onnxResult); + } + + Done(); + } + private void CreateDummyExamplesToMakeComplierHappy() { var dummyExample = new BreastCancerFeatureVector() { Features = null }; @@ -1035,6 +1085,29 @@ private void CompareResults(string leftColumnName, string rightColumnName, IData CompareSelectedVectorColumns(leftColumnName, rightColumnName, left, right); } + private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + T expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + + Assert.Equal(1, actual.Length); + Assert.Equal(expected, actual.GetItemOrDefault(0)); + } + } + } + private void CompareSelectedVectorColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) { var leftColumn = left.Schema[leftColumnName]; From f8c6ab6ff855e3860e6068f295f0f549ce000150 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Mon, 18 Nov 2019 10:54:11 -0800 Subject: [PATCH 03/17] Don't convert to string in HashText --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 5 ++--- src/Microsoft.ML.Data/Transforms/Hashing.cs | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 84e2f2648f..50cac69b06 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -169,12 +169,11 @@ public static uint MurmurRoundDouble(uint hash, double chunk) } } - public static uint MurmurRoundText(uint hash, string chunk) + public static uint MurmurRoundText(uint hash, ReadOnlyMemory chunk) { unsafe { - byte[] utf16Bytes = Encoding.Unicode.GetBytes(chunk); - byte[] utf8Bytes = Encoding.Convert(Encoding.Unicode, Encoding.UTF8, utf16Bytes); + byte[] utf8Bytes = Encoding.UTF8.GetBytes(chunk.ToArray()); fixed (byte* key = utf8Bytes) return MurmurRoundV2(hash, key, chunk.Length); } diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index a186ac8496..53b4e6ccdc 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -577,8 +577,8 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) { var len = value.Length; - string valueString = value.ToString(); - return Hashing.MixHashV2(Hashing.MurmurRoundText(seed, valueString), len); + //string valueString = value.ToString(); + return Hashing.MixHashV2(Hashing.MurmurRoundText(seed, value), len); } } From d17eeb3c812c709077f1c6b6ec3e846fdb27b871 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Mon, 18 Nov 2019 10:57:00 -0800 Subject: [PATCH 04/17] add versioning for vectro types --- src/Microsoft.ML.Data/Transforms/Hashing.cs | 49 ++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 53b4e6ccdc..6074b07d29 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -255,7 +255,7 @@ private Delegate GetGetterCore(DataViewRow input, int iinfo, out Action disposer } if (!(srcType is VectorDataViewType vectorType2)) return ComposeGetterOneV2(input, iinfo, srcCol, srcType); - return ComposeGetterVec(input, iinfo, srcCol, vectorType2); + return ComposeGetterVecV2(input, iinfo, srcCol, vectorType2); } private protected override IRowMapper MakeRowMapper(DataViewSchema schema) => new Mapper(this, schema); @@ -485,6 +485,53 @@ private ValueGetter> ComposeGetterVec(DataViewRow input, int iinfo return ComposeGetterVecCore, HashText>(input, iinfo, srcCol, srcType); } + private ValueGetter> ComposeGetterVecV2(DataViewRow input, int iinfo, int srcCol, VectorDataViewType srcType) + { + Host.Assert(HashingEstimator.IsColumnTypeValid(srcType.ItemType)); + + Type rawType = srcType.ItemType.RawType; + if (srcType.ItemType is KeyDataViewType) + { + if (rawType == typeof(byte)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(ushort)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(uint)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + + Host.Assert(rawType == typeof(ulong)); + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + } + + if (rawType == typeof(byte)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(ushort)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(uint)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(ulong)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(DataViewRowId)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(sbyte)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(short)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(int)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(long)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(float)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(double)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + else if (rawType == typeof(bool)) + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + + Host.Assert(srcType.ItemType == TextDataViewType.Instance); + return ComposeGetterVecCore, HashTextV2>(input, iinfo, srcCol, srcType); + } + private ValueGetter> ComposeGetterVecCore(DataViewRow input, int iinfo, int srcCol, VectorDataViewType srcType) where THash : struct, IHasher { From 48a3585987495d62645f0447659c091001242a20 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Wed, 20 Nov 2019 14:49:09 -0800 Subject: [PATCH 05/17] Use span instead of pointers --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 117 +++++++-------------- 1 file changed, 37 insertions(+), 80 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 50cac69b06..9f6c5c9620 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -105,15 +105,14 @@ public static uint MurmurRound(uint hash, uint chunk) return hash; } - private static unsafe uint MurmurRoundV2(uint hash, byte* key, int len) + private static uint MurmurRoundSpanV2(uint hash, Span key, int len) { int nblocks = len / 4; - byte* data = key; - uint* blocks = (uint*)(data + nblocks * 4); + byte[] data = key.ToArray(); - for (int i = -nblocks; i!=0; i++) + for (int i = nblocks; i >0; i--) { - uint chunk = blocks[i]; + uint chunk = BitConverter.ToUInt32(data, (nblocks * 4 - i*4)); chunk *= 0xCC9E2D51; chunk = Rotate(chunk, 15); chunk *= 0x1B873593; @@ -124,7 +123,7 @@ private static unsafe uint MurmurRoundV2(uint hash, byte* key, int len) hash += 0xE6546B64; } - byte* tail = (byte*)(data + nblocks * 4); + byte[] tail = new byte[3]{data[len-3], data[len-2], data[len-1]}; uint k1 = 0; @@ -149,120 +148,78 @@ private static unsafe uint MurmurRoundV2(uint hash, byte* key, int len) public static uint MurmurRoundFloat(uint hash, float chunk) { - unsafe - { - float* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 4); - } + var floatBytes = BitConverter.GetBytes(chunk); + Span key = new Span(floatBytes); + return MurmurRoundSpanV2(hash, key, 4); } public static uint MurmurRoundDouble(uint hash, double chunk) { - unsafe - { - double* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 4); - } + var doubleBytes = BitConverter.GetBytes(chunk); + Span key = new Span(doubleBytes); + return MurmurRoundSpanV2(hash, key, 8); } public static uint MurmurRoundText(uint hash, ReadOnlyMemory chunk) { - unsafe - { - byte[] utf8Bytes = Encoding.UTF8.GetBytes(chunk.ToArray()); - fixed (byte* key = utf8Bytes) - return MurmurRoundV2(hash, key, chunk.Length); - } + byte[] utf8Bytes = Encoding.UTF8.GetBytes(chunk.ToArray()); + var key = new Span(utf8Bytes); + return MurmurRoundSpanV2(hash, key, chunk.Length); } public static uint MurmurRoundU1(uint hash, byte chunk) { - unsafe - { - byte* key = &chunk; - return MurmurRoundV2(hash, key, 1); - } + Span key = new Span(new byte[] { chunk }); + return MurmurRoundSpanV2(hash, key, 1); } public static uint MurmurRoundU2(uint hash, ushort chunk) { - unsafe - { - ushort* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 2); - } + var ushortBytes = BitConverter.GetBytes(chunk); + Span key = new Span(ushortBytes); + return MurmurRoundSpanV2(hash, key, 2); } public static uint MurmurRoundU4(uint hash, uint chunk) { - unsafe - { - uint* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 4); - } + var uintBytes = BitConverter.GetBytes(chunk); + Span key = new Span(uintBytes); + return MurmurRoundSpanV2(hash, key, 4); } public static uint MurmurRoundU8(uint hash, ulong chunk) { - unsafe - { - ulong* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 8); - } + var ulongBytes = BitConverter.GetBytes(chunk); + Span key = new Span(ulongBytes); + return MurmurRoundSpanV2(hash, key, 8); } public static uint MurmurRoundI1(uint hash, sbyte chunk) { - unsafe - { - sbyte* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 1); - } + var sbyteBytes = BitConverter.GetBytes(chunk); + Span key = new Span(sbyteBytes); + return MurmurRoundSpanV2(hash, key, 1); } public static uint MurmurRoundI2(uint hash, short chunk) { - unsafe - { - short* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 1); - } + var shortBytes = BitConverter.GetBytes(chunk); + Span key = new Span(shortBytes); + return MurmurRoundSpanV2(hash, key, 2); } public static uint MurmurRoundI4(uint hash, int chunk) { - unsafe - { - int* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 1); - } + var intBytes = BitConverter.GetBytes(chunk); + Span key = new Span(intBytes); + return MurmurRoundSpanV2(hash, key, 4); } public static uint MurmurRoundI8(uint hash, long chunk) { - unsafe - { - long* keys = &chunk; - byte* key; - key = (byte*)keys; - return MurmurRoundV2(hash, key, 1); - } + var longBytes = BitConverter.GetBytes(chunk); + Span key = new Span(longBytes); + return MurmurRoundSpanV2(hash, key, 8); } /// From 11f5de7722da2ea2318dfd9dff78e296af802a19 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Thu, 21 Nov 2019 10:51:03 -0800 Subject: [PATCH 06/17] Add perf test --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index b941c9ac74..a7e02f10fe 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -307,6 +307,41 @@ public void KeyToVectorWithBagOnnxConversionTest() Done(); } + private static readonly Random random = new Random(); + public static string RandomString(int length) + { + const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + return new string(Enumerable.Repeat(chars, length) + .Select(s => s[random.Next(s.Length)]).ToArray()); + } + + public class HashData2 + { + public bool Label { get; set; } + public string Features { get; set; } + } + + [Fact] + public void KeyToVectorWithBagHashConversionTest() + { + var mlContext = new MLContext(seed: 1); + int n = 100000000; + var samples = new List(); + for (int i = 0; i < n; i++) + { + samples.Add(new HashData2 { Label = true, Features = RandomString(5) }); + } + var data = mlContext.Data.LoadFromEnumerable(samples); + + var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("Features", null, OneHotEncodingEstimator.OutputKind.Bag) + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); + + var model = pipeline.Fit(data); + //var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numberOfFolds: 5); + + Done(); + } + [Fact] public void InitializerCreationTest() { From f2d787e01c40e80419e3f4cb3480268992758e03 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Fri, 22 Nov 2019 10:59:55 -0800 Subject: [PATCH 07/17] add perf for num types --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index a7e02f10fe..14bbd58206 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -321,6 +321,12 @@ public class HashData2 public string Features { get; set; } } + public class HashDataNum + { + public bool Label { get; set; } + public uint Features { get; set; } + } + [Fact] public void KeyToVectorWithBagHashConversionTest() { @@ -342,6 +348,27 @@ public void KeyToVectorWithBagHashConversionTest() Done(); } + [Fact] + public void KeyToVectorWithBagHashNumConversionTest() + { + var mlContext = new MLContext(seed: 1); + int n = 100000000; + var samples = new List(); + for (int i = 0; i < n; i++) + { + samples.Add(new HashDataNum { Label = true, Features = (uint)i }); + } + var data = mlContext.Data.LoadFromEnumerable(samples); + + var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("Features", null, OneHotEncodingEstimator.OutputKind.Bag) + .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); + + var model = pipeline.Fit(data); + //var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numberOfFolds: 5); + + Done(); + } + [Fact] public void InitializerCreationTest() { From 99ec4fdce9fae76c679cf0c99dab46fb00ca71a7 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Tue, 3 Dec 2019 11:07:19 -0800 Subject: [PATCH 08/17] update hashing algorithm --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 13 ++++++------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 9f6c5c9620..bdea58ffe6 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System; +using System.Buffers.Binary; using System.Runtime.CompilerServices; using System.Text; using Microsoft.ML.Runtime; @@ -108,11 +109,11 @@ public static uint MurmurRound(uint hash, uint chunk) private static uint MurmurRoundSpanV2(uint hash, Span key, int len) { int nblocks = len / 4; - byte[] data = key.ToArray(); + var data = key; for (int i = nblocks; i >0; i--) { - uint chunk = BitConverter.ToUInt32(data, (nblocks * 4 - i*4)); + uint chunk = BinaryPrimitives.ReadUInt32LittleEndian(data.Slice(nblocks * 4 - i*4, 4)); chunk *= 0xCC9E2D51; chunk = Rotate(chunk, 15); chunk *= 0x1B873593; @@ -123,20 +124,18 @@ private static uint MurmurRoundSpanV2(uint hash, Span key, int len) hash += 0xE6546B64; } - byte[] tail = new byte[3]{data[len-3], data[len-2], data[len-1]}; - uint k1 = 0; switch (len & 3) { case 3: - k1 ^= (uint)tail[2] << 16; + k1 ^= (uint)data[len-1] << 16; goto case 2; case 2: - k1 ^= (uint)tail[1] << 8; + k1 ^= (uint)data[len-2] << 8; goto case 1; case 1: - k1 ^= tail[0]; + k1 ^= data[len-3]; k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15); k1 *= 0x1B873593; hash ^= k1; diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 6074b07d29..13e0498adc 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -625,7 +625,7 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) { var len = value.Length; //string valueString = value.ToString(); - return Hashing.MixHashV2(Hashing.MurmurRoundText(seed, value), len); + return (Hashing.MixHashV2(Hashing.MurmurRoundText(seed, value), len) & mask); } } From 33ad3aefbf87a753418637c3f17ef53a0e2f473d Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Tue, 3 Dec 2019 16:53:37 -0800 Subject: [PATCH 09/17] initial cleanup --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 45 ++++---------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 66 ++++++++++++++------- 2 files changed, 57 insertions(+), 54 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index bdea58ffe6..65c0ee789c 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -106,45 +106,22 @@ public static uint MurmurRound(uint hash, uint chunk) return hash; } - private static uint MurmurRoundSpanV2(uint hash, Span key, int len) + public static uint MurmurRoundV2(uint hash, uint key) { - int nblocks = len / 4; - var data = key; + var chunk = key; - for (int i = nblocks; i >0; i--) - { - uint chunk = BinaryPrimitives.ReadUInt32LittleEndian(data.Slice(nblocks * 4 - i*4, 4)); - chunk *= 0xCC9E2D51; - chunk = Rotate(chunk, 15); - chunk *= 0x1B873593; - - hash ^= chunk; - hash = Rotate(hash, 13); - hash *= 5; - hash += 0xE6546B64; - } - - uint k1 = 0; + chunk *= 0xCC9E2D51; + chunk = Rotate(chunk, 15); + chunk *= 0x1B873593; - switch (len & 3) - { - case 3: - k1 ^= (uint)data[len-1] << 16; - goto case 2; - case 2: - k1 ^= (uint)data[len-2] << 8; - goto case 1; - case 1: - k1 ^= data[len-3]; - k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15); - k1 *= 0x1B873593; - hash ^= k1; - break; - } + hash ^= chunk; + hash = Rotate(hash, 13); + hash *= 5; + hash += 0xE6546B64; return hash; } - + /* public static uint MurmurRoundFloat(uint hash, float chunk) { var floatBytes = BitConverter.GetBytes(chunk); @@ -220,7 +197,7 @@ public static uint MurmurRoundI8(uint hash, long chunk) Span key = new Span(longBytes); return MurmurRoundSpanV2(hash, key, 8); } - + */ /// /// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding. /// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th. diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 13e0498adc..e1e16493e2 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -583,7 +583,7 @@ public uint HashCore(uint seed, uint mask, in float value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in float value) - => float.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundFloat(seed, value), 4)); + => float.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, FloatUtils.GetBits(value == 0 ? 0 : value)), 4) & mask); } private readonly struct HashDouble : IHasher @@ -608,7 +608,17 @@ public uint HashCore(uint seed, uint mask, in double value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in double value) - => double.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundDouble(seed, value), 8)); + { + if (double.IsNaN(value)) + return 0; + + ulong v = FloatUtils.GetBits(value == 0 ? 0 : value); + var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(v)); + var hi = Utils.GetHi(v); + if (hi != 0) + hash = Hashing.MurmurRoundV2(hash, hi); + return (Hashing.MixHashV2(hash, 8) & mask); + } } private readonly struct HashText : IHasher> @@ -622,11 +632,7 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) - { - var len = value.Length; - //string valueString = value.ToString(); - return (Hashing.MixHashV2(Hashing.MurmurRoundText(seed, value), len) & mask); - } + => value.IsEmpty ? 0 : (Hashing.MurmurHash(seed, value.Span.Trim(' ')) & mask) + 1; } private readonly struct HashKey1 : IHasher @@ -640,7 +646,7 @@ public uint HashCore(uint seed, uint mask, in byte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in byte value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value), 1) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 1) & mask) + 1; } private readonly struct HashKey2 : IHasher @@ -654,7 +660,7 @@ public uint HashCore(uint seed, uint mask, in ushort value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ushort value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU2(seed, value), 2) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask) + 1; } private readonly struct HashKey4 : IHasher @@ -668,7 +674,7 @@ public uint HashCore(uint seed, uint mask, in uint value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in uint value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU4(seed, value), 4) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask) + 1; } private readonly struct HashKey8 : IHasher @@ -690,7 +696,15 @@ public uint HashCore(uint seed, uint mask, in ulong value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ulong value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundU8(seed, value), 8) & mask) + 1; + { + if (value == 0) + return 0; + var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(value)); + var hi = Utils.GetHi(value); + if (hi != 0) + hash = Hashing.MurmurRoundV2(hash, hi); + return (Hashing.MixHashV2(hash, 4) & mask) + 1; + } } private readonly struct HashU1 : IHasher @@ -704,7 +718,7 @@ public uint HashCore(uint seed, uint mask, in byte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in byte value) - => (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value), 1) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask); } private readonly struct HashU2 : IHasher @@ -718,7 +732,7 @@ public uint HashCore(uint seed, uint mask, in ushort value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ushort value) - => (Hashing.MixHashV2(Hashing.MurmurRoundU2(seed, value), 2) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask); } private readonly struct HashU4 : IHasher @@ -732,7 +746,7 @@ public uint HashCore(uint seed, uint mask, in uint value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in uint value) - => (Hashing.MixHashV2(Hashing.MurmurRoundU4(seed, value), 4)); //&mask + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4)) & mask; } private readonly struct HashU8 : IHasher @@ -752,7 +766,13 @@ public uint HashCore(uint seed, uint mask, in ulong value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ulong value) - => (Hashing.MixHashV2(Hashing.MurmurRoundU8(seed, value), 8) & mask); //&mask + { + var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(value)); + var hi = Utils.GetHi(value); + if (hi != 0) + hash = Hashing.MurmurRoundV2(hash, hi); + return (Hashing.MixHashV2(hash, 4) & mask) + 1; + } } private readonly struct HashU16 : IHasher @@ -786,7 +806,7 @@ public uint HashCore(uint seed, uint mask, in bool value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in bool value) - => (Hashing.MixHashV2(Hashing.MurmurRoundU1(seed, value ? (byte)1 : (byte)0), 1) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value ? 1u : 0u), 4) & mask); } private readonly struct HashI1 : IHasher @@ -800,7 +820,7 @@ public uint HashCore(uint seed, uint mask, in sbyte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in sbyte value) - => (Hashing.MixHashV2(Hashing.MurmurRoundI1(seed, value), 1) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); } private readonly struct HashI2 : IHasher @@ -814,7 +834,7 @@ public uint HashCore(uint seed, uint mask, in short value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in short value) - => (Hashing.MixHashV2(Hashing.MurmurRoundI2(seed, value), 2) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); } private readonly struct HashI4 : IHasher @@ -828,7 +848,7 @@ public uint HashCore(uint seed, uint mask, in int value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in int value) - => (Hashing.MixHashV2(Hashing.MurmurRoundI4(seed, value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); } private readonly struct HashI8 : IHasher @@ -848,7 +868,13 @@ public uint HashCore(uint seed, uint mask, in long value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in long value) - => (Hashing.MixHashV2(Hashing.MurmurRoundI8(seed, value), 8) & mask); + { + var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo((ulong)value)); + var hi = Utils.GetHi((ulong)value); + if (hi != 0) + hash = Hashing.MurmurRoundV2(hash, hi); + return (Hashing.MixHashV2(hash, 4) & mask); + } } private static ValueGetter MakeScalarHashGetter(DataViewRow input, int srcCol, uint seed, uint mask) From 4f8e413c7d39a4e6687b062b9b855e6a09388067 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Thu, 5 Dec 2019 14:38:41 -0800 Subject: [PATCH 10/17] modify hashing algo for strings --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 83 +++++++++++++++++++++ src/Microsoft.ML.Data/Transforms/Hashing.cs | 2 +- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 65c0ee789c..e9e70d19d1 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -267,6 +267,89 @@ public static uint MurmurHash(uint hash, ReadOnlySpan span, bool toUpper = return hash; } + public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper = false) + { + // Byte length (in pseudo UTF-8 form). + int len = 0; + + // Current bits, value and count. + ulong cur = 0; + int bits = 0; + for (int ich = 0; ich < span.Length; ich++) + { + Contracts.Assert((bits & 0x7) == 0); + Contracts.Assert((uint)bits <= 24); + Contracts.Assert(cur <= 0x00FFFFFF); + + uint ch = toUpper ? char.ToUpperInvariant(span[ich]) : span[ich]; + if (ch <= 0x007F) + { + cur |= ch << bits; + bits += 8; + } + else if (ch <= 0x07FF) + { + cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x1F00) | 0xC080) << bits; + bits += 16; + } + else if (ch <= 0xFFFF) + { + //Contracts.Assert(ch <= 0xFFFF); + cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits; + bits += 24; + } + else + { + Contracts.Assert(ch <= 0x10FFFF); + cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x3F0000) | ((ch << 6) & 0x07000000) | 0xF0808080) << bits; + bits += 32; + } + + if (bits >= 32) + { + hash = MurmurRound(hash, (uint)cur); + cur = cur >> 32; + bits -= 32; + len += 4; + } + } + Contracts.Assert((bits & 0x7) == 0); + Contracts.Assert((uint)bits <= 24); + Contracts.Assert(cur <= 0x00FFFFFF); + + if (bits > 0) + { + //hash = MurmurRound(hash, (uint)cur); + len += bits / 8; + } + + // Encode the length. + //hash = MurmurRound(hash, (uint)len); + + // tail processing + uint k1 = 0; + switch (len & 3) + { + case 3: + k1 ^= (uint)((cur & 0xFF) << 16); + goto case 2; + case 2: + k1 ^= (uint)((cur >> 8) & 0xFF) << 8; + goto case 1; + case 1: + k1 ^= (uint)(cur >> 16) & 0xFF; + k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15); + k1 *= 0x1B873593; + hash ^= k1; + break; + } + // Final mixing ritual for the hash. + hash ^= (uint)len; + hash = MixHash(hash); + + return hash; + } + /// /// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding. /// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th. diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index e1e16493e2..be806f084c 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -632,7 +632,7 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) - => value.IsEmpty ? 0 : (Hashing.MurmurHash(seed, value.Span.Trim(' ')) & mask) + 1; + => value.IsEmpty ? 0 : (Hashing.MurmurHashV2(seed, value.Span) & mask); } private readonly struct HashKey1 : IHasher From a7b06d4aedc62b80818a978f801c58aa3169701d Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Fri, 6 Dec 2019 14:29:58 -0800 Subject: [PATCH 11/17] remove MurmurRoundV2 --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 101 +------------------- src/Microsoft.ML.Data/Transforms/Hashing.cs | 40 ++++---- 2 files changed, 22 insertions(+), 119 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index e9e70d19d1..4f9bf3d170 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -106,98 +106,6 @@ public static uint MurmurRound(uint hash, uint chunk) return hash; } - public static uint MurmurRoundV2(uint hash, uint key) - { - var chunk = key; - - chunk *= 0xCC9E2D51; - chunk = Rotate(chunk, 15); - chunk *= 0x1B873593; - - hash ^= chunk; - hash = Rotate(hash, 13); - hash *= 5; - hash += 0xE6546B64; - - return hash; - } - /* - public static uint MurmurRoundFloat(uint hash, float chunk) - { - var floatBytes = BitConverter.GetBytes(chunk); - Span key = new Span(floatBytes); - return MurmurRoundSpanV2(hash, key, 4); - } - - public static uint MurmurRoundDouble(uint hash, double chunk) - { - var doubleBytes = BitConverter.GetBytes(chunk); - Span key = new Span(doubleBytes); - return MurmurRoundSpanV2(hash, key, 8); - } - - public static uint MurmurRoundText(uint hash, ReadOnlyMemory chunk) - { - byte[] utf8Bytes = Encoding.UTF8.GetBytes(chunk.ToArray()); - var key = new Span(utf8Bytes); - return MurmurRoundSpanV2(hash, key, chunk.Length); - } - - public static uint MurmurRoundU1(uint hash, byte chunk) - { - Span key = new Span(new byte[] { chunk }); - return MurmurRoundSpanV2(hash, key, 1); - } - - public static uint MurmurRoundU2(uint hash, ushort chunk) - { - var ushortBytes = BitConverter.GetBytes(chunk); - Span key = new Span(ushortBytes); - return MurmurRoundSpanV2(hash, key, 2); - } - - public static uint MurmurRoundU4(uint hash, uint chunk) - { - var uintBytes = BitConverter.GetBytes(chunk); - Span key = new Span(uintBytes); - return MurmurRoundSpanV2(hash, key, 4); - } - - public static uint MurmurRoundU8(uint hash, ulong chunk) - { - var ulongBytes = BitConverter.GetBytes(chunk); - Span key = new Span(ulongBytes); - return MurmurRoundSpanV2(hash, key, 8); - } - - public static uint MurmurRoundI1(uint hash, sbyte chunk) - { - var sbyteBytes = BitConverter.GetBytes(chunk); - Span key = new Span(sbyteBytes); - return MurmurRoundSpanV2(hash, key, 1); - } - - public static uint MurmurRoundI2(uint hash, short chunk) - { - var shortBytes = BitConverter.GetBytes(chunk); - Span key = new Span(shortBytes); - return MurmurRoundSpanV2(hash, key, 2); - } - - public static uint MurmurRoundI4(uint hash, int chunk) - { - var intBytes = BitConverter.GetBytes(chunk); - Span key = new Span(intBytes); - return MurmurRoundSpanV2(hash, key, 4); - } - - public static uint MurmurRoundI8(uint hash, long chunk) - { - var longBytes = BitConverter.GetBytes(chunk); - Span key = new Span(longBytes); - return MurmurRoundSpanV2(hash, key, 8); - } - */ /// /// Implements the murmur hash 3 algorithm, using a mock UTF-8 encoding. /// The UTF-8 conversion ignores the possibilities of unicode planes other than the 0th. @@ -294,7 +202,6 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper } else if (ch <= 0xFFFF) { - //Contracts.Assert(ch <= 0xFFFF); cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits; bits += 24; } @@ -319,13 +226,9 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper if (bits > 0) { - //hash = MurmurRound(hash, (uint)cur); len += bits / 8; } - // Encode the length. - //hash = MurmurRound(hash, (uint)len); - // tail processing uint k1 = 0; switch (len & 3) @@ -343,9 +246,9 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper hash ^= k1; break; } + // Final mixing ritual for the hash. - hash ^= (uint)len; - hash = MixHash(hash); + hash = MixHashV2(hash, len); return hash; } diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index be806f084c..e076493fc6 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -583,7 +583,7 @@ public uint HashCore(uint seed, uint mask, in float value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in float value) - => float.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, FloatUtils.GetBits(value == 0 ? 0 : value)), 4) & mask); + => float.IsNaN(value) ? 0 : (Hashing.MixHashV2(Hashing.MurmurRound(seed, FloatUtils.GetBits(value == 0 ? 0 : value)), 4) & mask); } private readonly struct HashDouble : IHasher @@ -613,11 +613,11 @@ public uint HashCore(uint seed, uint mask, in double value) return 0; ulong v = FloatUtils.GetBits(value == 0 ? 0 : value); - var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(v)); + var hash = Hashing.MurmurRound(seed, Utils.GetLo(v)); var hi = Utils.GetHi(v); if (hi != 0) - hash = Hashing.MurmurRoundV2(hash, hi); - return (Hashing.MixHashV2(hash, 8) & mask); + hash = Hashing.MurmurRound(hash, hi); + return (Hashing.MixHashV2(hash, 4) & mask); } } @@ -646,7 +646,7 @@ public uint HashCore(uint seed, uint mask, in byte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in byte value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 1) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4) & mask) + 1; } private readonly struct HashKey2 : IHasher @@ -660,7 +660,7 @@ public uint HashCore(uint seed, uint mask, in ushort value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ushort value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4) & mask) + 1; } private readonly struct HashKey4 : IHasher @@ -674,7 +674,7 @@ public uint HashCore(uint seed, uint mask, in uint value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in uint value) - => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask) + 1; + => value == 0 ? 0 : (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4) & mask) + 1; } private readonly struct HashKey8 : IHasher @@ -699,10 +699,10 @@ public uint HashCore(uint seed, uint mask, in ulong value) { if (value == 0) return 0; - var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(value)); + var hash = Hashing.MurmurRound(seed, Utils.GetLo(value)); var hi = Utils.GetHi(value); if (hi != 0) - hash = Hashing.MurmurRoundV2(hash, hi); + hash = Hashing.MurmurRound(hash, hi); return (Hashing.MixHashV2(hash, 4) & mask) + 1; } } @@ -718,7 +718,7 @@ public uint HashCore(uint seed, uint mask, in byte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in byte value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4) & mask); } private readonly struct HashU2 : IHasher @@ -732,7 +732,7 @@ public uint HashCore(uint seed, uint mask, in ushort value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ushort value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4) & mask); } private readonly struct HashU4 : IHasher @@ -746,7 +746,7 @@ public uint HashCore(uint seed, uint mask, in uint value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in uint value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value), 4)) & mask; + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, value), 4)) & mask; } private readonly struct HashU8 : IHasher @@ -767,10 +767,10 @@ public uint HashCore(uint seed, uint mask, in ulong value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ulong value) { - var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo(value)); + var hash = Hashing.MurmurRound(seed, Utils.GetLo(value)); var hi = Utils.GetHi(value); if (hi != 0) - hash = Hashing.MurmurRoundV2(hash, hi); + hash = Hashing.MurmurRound(hash, hi); return (Hashing.MixHashV2(hash, 4) & mask) + 1; } } @@ -806,7 +806,7 @@ public uint HashCore(uint seed, uint mask, in bool value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in bool value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, value ? 1u : 0u), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, value ? 1u : 0u), 4) & mask); } private readonly struct HashI1 : IHasher @@ -820,7 +820,7 @@ public uint HashCore(uint seed, uint mask, in sbyte value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in sbyte value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, (uint)value), 4) & mask); } private readonly struct HashI2 : IHasher @@ -834,7 +834,7 @@ public uint HashCore(uint seed, uint mask, in short value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in short value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, (uint)value), 4) & mask); } private readonly struct HashI4 : IHasher @@ -848,7 +848,7 @@ public uint HashCore(uint seed, uint mask, in int value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in int value) - => (Hashing.MixHashV2(Hashing.MurmurRoundV2(seed, (uint)value), 4) & mask); + => (Hashing.MixHashV2(Hashing.MurmurRound(seed, (uint)value), 4) & mask); } private readonly struct HashI8 : IHasher @@ -869,10 +869,10 @@ public uint HashCore(uint seed, uint mask, in long value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in long value) { - var hash = Hashing.MurmurRoundV2(seed, Utils.GetLo((ulong)value)); + var hash = Hashing.MurmurRound(seed, Utils.GetLo((ulong)value)); var hi = Utils.GetHi((ulong)value); if (hi != 0) - hash = Hashing.MurmurRoundV2(hash, hi); + hash = Hashing.MurmurRound(hash, hi); return (Hashing.MixHashV2(hash, 4) & mask); } } From f02737114331d8cbd24a9ce91d38908414cd40d3 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Wed, 11 Dec 2019 10:36:09 -0800 Subject: [PATCH 12/17] Add hashing benchark --- .../Text/MulticlassHashClassification.cs | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 test/Microsoft.ML.Benchmarks/Text/MulticlassHashClassification.cs diff --git a/test/Microsoft.ML.Benchmarks/Text/MulticlassHashClassification.cs b/test/Microsoft.ML.Benchmarks/Text/MulticlassHashClassification.cs new file mode 100644 index 0000000000..044eaa8cf6 --- /dev/null +++ b/test/Microsoft.ML.Benchmarks/Text/MulticlassHashClassification.cs @@ -0,0 +1,46 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.IO; +using BenchmarkDotNet.Attributes; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.LightGbm; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Transforms; +using Microsoft.ML.TestFrameworkCommon; + +namespace Microsoft.ML.Benchmarks +{ + [Config(typeof(TrainConfig))] + public class MulticlassHashClassificationTrain + { + private string _dataPath_Wiki; + + [GlobalSetup] + public void SetupTrainingSpeedTests() + { + _dataPath_Wiki = BaseTestClass.GetDataPath(TestDatasets.WikiDetox.trainFilename); + + if (!File.Exists(_dataPath_Wiki)) + throw new FileNotFoundException(string.Format(Errors.DatasetNotFound, _dataPath_Wiki)); + } + + [Benchmark] + public void CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass() + { + string cmd = @"CV k=5 data=" + _dataPath_Wiki + + " loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+}" + + " xf=Convert{col=logged_in type=R4}" + + " xf=CategoricalTransform{col=ns}" + + " xf=TextTransform{col=FeaturesText:comment wordExtractor=NGramExtractorTransform{ngram=2}}" + + " xf=Concat{col=Features:FeaturesText,logged_in,ns}" + + " tr=LightGBMMulticlass{iter=10}"; + + var environment = EnvironmentFactory.CreateClassificationEnvironment(); + cmd.ExecuteMamlCommand(environment); + } + } +} From 88412b4cfb5812376b4d3c1d46e3817c22482bc5 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Tue, 17 Dec 2019 16:41:23 -0800 Subject: [PATCH 13/17] Add test for uint --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 59 ++++++++++++++++--- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 14bbd58206..e6c2699c52 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1065,26 +1065,69 @@ private class HashData } [Fact] - public void MurmurHashTest() + public void MurmurHashStringTest() { var mlContext = new MLContext(); var samples = new[] { new HashData {Education = "alibaba".AsMemory()}, - //new DataP {Education = "0-5yrs"}, - //new DataP {Education = "6-11yrs"}, - //new DataP {Education = "6-11yrs"}, - //new DataP {Education = "11-15yrs"} + new HashData {Education = "baba".AsMemory()}, + new HashData {Education = "U+123".AsMemory()}, + new HashData {Education = "djldaoiejffjauhglehdlgh".AsMemory()}, + new HashData {Education = "~".AsMemory()}, + }; + + IDataView data = mlContext.Data.LoadFromEnumerable(samples); + + var hashEstimator = new HashingEstimator(Env, "Education"); + var model = hashEstimator.Fit(data); + var hashTransformedData = model.Transform(data); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + + var onnxFileName = "MurmurHashV2.onnx"; + var onnxTextName = "MurmurHashV2.txt"; + var onnxModelPath = GetOutputPath(onnxFileName); + var onnxTextPath = GetOutputPath(onnxTextName); + + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedScalarColumns("Education", outputNames[0], hashTransformedData, onnxResult); + } + Done(); + } + + private class HashNumData + { + public uint Education { get; set; } + } + + [Fact] + public void MurmurHashUIntTest() + { + var mlContext = new MLContext(); + + var samples = new[] + { + new HashNumData {Education = 12}, + new HashNumData {Education = 456}, + new HashNumData {Education = 2}, + new HashNumData {Education = 34556789}, + new HashNumData {Education = 7896}, }; IDataView data = mlContext.Data.LoadFromEnumerable(samples); - //ta.GetRowCursor.Schema[0].Type = KeyDataViewType; var hashEstimator = new HashingEstimator(Env, "Education"); - //var modelPath = "MurmurHashModel.zip"; var model = hashEstimator.Fit(data); - //mlContext.Model.Save(model, data.Schema, modelPath); var hashTransformedData = model.Transform(data); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); From 53696b3d66705c3026669ef81b7b26ef4f1de012 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Tue, 17 Dec 2019 16:42:55 -0800 Subject: [PATCH 14/17] remove comments --- src/Microsoft.ML.Data/Transforms/Hashing.cs | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index e076493fc6..b745cee759 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -632,7 +632,7 @@ public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) { [MethodImpl(MethodImplOptions.AggressiveInlining)] public uint HashCore(uint seed, uint mask, in ReadOnlyMemory value) - => value.IsEmpty ? 0 : (Hashing.MurmurHashV2(seed, value.Span) & mask); + => value.IsEmpty ? 0 : (Hashing.MurmurHashV2(seed, value.Span)); } private readonly struct HashKey1 : IHasher @@ -1087,12 +1087,6 @@ private void AddMetaKeyValues(int i, DataViewSchema.Annotations.Builder builder) private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariable, string dstVariable) { string opType; -/* - opType = "Cast"; - string castOutput = ctx.AddIntermediateVariable(_types[iinfo], "CastOutput", true); - var castNode = ctx.CreateNode(opType, srcVariable, castOutput, ctx.GetNodeName(opType), ""); - var t = _types[iinfo].RawType;//OutputKind.ToInternalDataKind().ToType(); - castNode.AddAttribute("to", t);*/ opType = "MurmurHash3"; string murmurOutput = ctx.AddIntermediateVariable(_types[iinfo], "MurmurOutput", true); @@ -1101,17 +1095,6 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariable, stri var seed = _parent._columns[iinfo].Seed; murmurNode.AddAttribute("seed", seed); - /* - opType = "And"; - var mask = (1U << _parent._columns[iinfo].NumberOfBits) - 1; - string m = ctx.AddInitializer(mask); - string andOutput = ctx.AddIntermediateVariable(_types[iinfo], "andOutput", true); - var andNode = ctx.CreateNode(opType, new[] { murmurOutput, m}, new[] { andOutput}, ctx.GetNodeName(opType), ""); - - opType = "Add"; - string one = ctx.AddInitializer(1); - var addNode = ctx.CreateNode(opType, new[] { andOutput, one }, new[] { dstVariable }, ctx.GetNodeName(opType), "");*/ - return true; } From e2f742c8c6db8bc0b8b14065eb6c9e193fc44176 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Tue, 17 Dec 2019 17:24:00 -0800 Subject: [PATCH 15/17] modify murmur hash for strings --- src/Microsoft.ML.Core/Utilities/Hashing.cs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Core/Utilities/Hashing.cs b/src/Microsoft.ML.Core/Utilities/Hashing.cs index 4f9bf3d170..f551c6ccac 100644 --- a/src/Microsoft.ML.Core/Utilities/Hashing.cs +++ b/src/Microsoft.ML.Core/Utilities/Hashing.cs @@ -198,17 +198,20 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper else if (ch <= 0x07FF) { cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x1F00) | 0xC080) << bits; + cur = (cur & 0xFF) << 8 | cur >> 8; bits += 16; } else if (ch <= 0xFFFF) { cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x0F0000) | 0xE08080) << bits; + cur = (cur & 0xFF) << 16 | ((cur >> 8) & 0xFF) << 8 | cur >> 16; bits += 24; } else { Contracts.Assert(ch <= 0x10FFFF); cur |= (ulong)((ch & 0x003F) | ((ch << 2) & 0x3F00) | ((ch << 4) & 0x3F0000) | ((ch << 6) & 0x07000000) | 0xF0808080) << bits; + cur = (cur & 0xFF) << 24 | ((cur >> 8) & 0xFF) << 16 | ((cur >> 16) & 0xFF) << 8 | cur >> 24; bits += 32; } @@ -234,13 +237,13 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan span, bool toUpper switch (len & 3) { case 3: - k1 ^= (uint)((cur & 0xFF) << 16); + k1 ^= (uint)(((cur >> 16) & 0xFF) << 16); goto case 2; case 2: k1 ^= (uint)((cur >> 8) & 0xFF) << 8; goto case 1; case 1: - k1 ^= (uint)(cur >> 16) & 0xFF; + k1 ^= (uint)(cur & 0xFF); k1 *= 0xCC9E2D51; k1 = Rotate(k1, 15); k1 *= 0x1B873593; hash ^= k1; From 93caa74c486bf698fe2c7ee6bc2b90a8bb51eff0 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Thu, 19 Dec 2019 11:22:05 -0800 Subject: [PATCH 16/17] Onnx conversion for key types --- src/Microsoft.ML.Data/Transforms/Hashing.cs | 32 ++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index b745cee759..59f7b020db 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -508,7 +508,7 @@ private ValueGetter> ComposeGetterVecV2(DataViewRow input, int iin else if (rawType == typeof(ushort)) return ComposeGetterVecCore(input, iinfo, srcCol, srcType); else if (rawType == typeof(uint)) - return ComposeGetterVecCore(input, iinfo, srcCol, srcType); + return ComposeGetterVecCore(input, iinfo, srcCol, srcType); else if (rawType == typeof(ulong)) return ComposeGetterVecCore(input, iinfo, srcCol, srcType); else if (rawType == typeof(DataViewRowId)) @@ -1087,14 +1087,38 @@ private void AddMetaKeyValues(int i, DataViewSchema.Annotations.Builder builder) private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariable, string dstVariable) { string opType; + OnnxNode murmurNode; opType = "MurmurHash3"; - string murmurOutput = ctx.AddIntermediateVariable(_types[iinfo], "MurmurOutput", true); - var murmurNode = ctx.CreateNode(opType, srcVariable, dstVariable, ctx.GetNodeName(opType), "com.microsoft"); + if (_types[iinfo].RawType == typeof(KeyDataViewType)) + { + string murmurOutput = ctx.AddIntermediateVariable(_types[iinfo], "MurmurOutput", true); + murmurNode = ctx.CreateNode(opType, srcVariable, murmurOutput, ctx.GetNodeName(opType), "com.microsoft"); + + opType = "Cast"; + string castOutput = ctx.AddIntermediateVariable(_types[iinfo], "CastOutput", true); + var castNode = ctx.CreateNode(opType, murmurOutput, castOutput, ctx.GetNodeName(opType), ""); + var t = NumberDataViewType.Int64.RawType; + castNode.AddAttribute("to", t); + + opType = "Add"; + string addOutput = ctx.AddIntermediateVariable(_types[iinfo], "AddOutput", true); + string one = ctx.AddInitializer(1); + var addNode = ctx.CreateNode(opType, new[] { castOutput, one }, new[] { addOutput }, ctx.GetNodeName(opType), ""); + + opType = "Cast"; + var castNodeFinal = ctx.CreateNode(opType, addOutput, dstVariable, ctx.GetNodeName(opType), ""); + var tFinal = NumberDataViewType.UInt32.RawType; + castNodeFinal.AddAttribute("to", tFinal); + } + else + { + murmurNode = ctx.CreateNode(opType, srcVariable, dstVariable, ctx.GetNodeName(opType), "com.microsoft"); + } + murmurNode.AddAttribute("positive", 1); var seed = _parent._columns[iinfo].Seed; murmurNode.AddAttribute("seed", seed); - return true; } From 2e78045eb211a9a3461c9dc22aab5d0ac47bd25c Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Thu, 19 Dec 2019 11:49:34 -0800 Subject: [PATCH 17/17] Remove tests --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 422b754c91..a891dc3dc3 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -416,68 +416,6 @@ public void KeyToVectorWithBagOnnxConversionTest() Done(); } - private static readonly Random random = new Random(); - public static string RandomString(int length) - { - const string chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - return new string(Enumerable.Repeat(chars, length) - .Select(s => s[random.Next(s.Length)]).ToArray()); - } - - public class HashData2 - { - public bool Label { get; set; } - public string Features { get; set; } - } - - public class HashDataNum - { - public bool Label { get; set; } - public uint Features { get; set; } - } - - [Fact] - public void KeyToVectorWithBagHashConversionTest() - { - var mlContext = new MLContext(seed: 1); - int n = 100000000; - var samples = new List(); - for (int i = 0; i < n; i++) - { - samples.Add(new HashData2 { Label = true, Features = RandomString(5) }); - } - var data = mlContext.Data.LoadFromEnumerable(samples); - - var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("Features", null, OneHotEncodingEstimator.OutputKind.Bag) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); - - var model = pipeline.Fit(data); - //var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numberOfFolds: 5); - - Done(); - } - - [Fact] - public void KeyToVectorWithBagHashNumConversionTest() - { - var mlContext = new MLContext(seed: 1); - int n = 100000000; - var samples = new List(); - for (int i = 0; i < n; i++) - { - samples.Add(new HashDataNum { Label = true, Features = (uint)i }); - } - var data = mlContext.Data.LoadFromEnumerable(samples); - - var pipeline = mlContext.Transforms.Categorical.OneHotHashEncoding("Features", null, OneHotEncodingEstimator.OutputKind.Bag) - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2)); - - var model = pipeline.Fit(data); - //var cvResults = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numberOfFolds: 5); - - Done(); - } - [Fact] public void InitializerCreationTest() {