From 3707045ff233578ad7a2c8c66b921a8c6278b764 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 19 Mar 2019 17:38:05 -0700 Subject: [PATCH 1/5] Add cancellation signal checkpoints in FastTree. --- .../Dynamic/Trainers/Regression/FastTree.cs | 2 +- docs/samples/Microsoft.ML.Samples/Program.cs | 3 ++- src/Microsoft.ML.FastTree/BoostingFastTree.cs | 3 ++- src/Microsoft.ML.FastTree/FastTree.cs | 2 ++ .../TreeLearners/LeastSquaresRegressionTreeLearner.cs | 8 +++++++- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs index 082bc340f3..5de6624a7e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs @@ -17,7 +17,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training examples. - var examples = GenerateRandomDataPoints(1000); + var examples = GenerateRandomDataPoints(100000); // Convert the examples list to an IDataView object, which is consumable by ML.NET API. var trainingData = mlContext.Data.LoadFromEnumerable(examples); diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index ef67739045..5fb245bd16 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -1,4 +1,5 @@ using Microsoft.ML.Samples.Dynamic; +using Microsoft.ML.Samples.Dynamic.Trainers.Regression; namespace Microsoft.ML.Samples { @@ -6,7 +7,7 @@ internal static class Program { static void Main(string[] args) { - ReplaceMissingValues.Example(); + FastTree.Example(); } } } diff --git a/src/Microsoft.ML.FastTree/BoostingFastTree.cs b/src/Microsoft.ML.FastTree/BoostingFastTree.cs index f211a87cb9..ad4cb08b05 100644 --- a/src/Microsoft.ML.FastTree/BoostingFastTree.cs +++ b/src/Microsoft.ML.FastTree/BoostingFastTree.cs @@ -68,7 +68,8 @@ private protected override TreeLearner ConstructTreeLearner(IChannel ch) FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.FeatureFractionPerSplit, FastTreeTrainerOptions.FilterZeroLambdas, FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaximumCategoricalGroupCountPerNode, FastTreeTrainerOptions.MaximumCategoricalSplitPointCount, BsrMaxTreeOutput(), ParallelTraining, - FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, FastTreeTrainerOptions.Bias); + FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, + FastTreeTrainerOptions.Bias, Host); } private protected override OptimizationAlgorithm ConstructOptimizationAlgorithm(IChannel ch) diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs index 895e79c466..a83fdab043 100644 --- a/src/Microsoft.ML.FastTree/FastTree.cs +++ b/src/Microsoft.ML.FastTree/FastTree.cs @@ -1919,6 +1919,7 @@ private void InitializeBins(int maxBins, IParallelTraining parallelTraining) List trivialFeatures = new List(); for (iFeature = 0; iFeature < NumFeatures; iFeature++) { + Host.CheckAlive(); if (!localConstructBinFeatures[iFeature]) continue; // The following strange call will actually sparsify. @@ -2230,6 +2231,7 @@ private IEnumerable CreateFlocksCore(IChannel ch, IProgressCha for (; iFeature < featureLim; ++iFeature) { + Host.CheckAlive(); double[] bup = BinUpperBounds[iFeature]; Contracts.Assert(Utils.Size(bup) > 0); if (bup.Length == 1) diff --git a/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs b/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs index cf382ee46f..3221021f51 100644 --- a/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs +++ b/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs @@ -69,6 +69,8 @@ internal class LeastSquaresRegressionTreeLearner : TreeLearner, ILeafSplitStatis protected readonly bool FilterZeros; protected readonly double BsrMaxTreeOutput; + protected readonly IHost Host; + // size of reserved memory private readonly long _sizeOfReservedMemory; @@ -114,12 +116,13 @@ internal class LeastSquaresRegressionTreeLearner : TreeLearner, ILeafSplitStatis /// /// /// + /// Host public LeastSquaresRegressionTreeLearner(Dataset trainData, int numLeaves, int minDocsInLeaf, double entropyCoefficient, double featureFirstUsePenalty, double featureReusePenalty, double softmaxTemperature, int histogramPoolSize, int randomSeed, double splitFraction, bool filterZeros, bool allowEmptyTrees, double gainConfidenceLevel, int maxCategoricalGroupsPerNode, int maxCategoricalSplitPointPerNode, double bsrMaxTreeOutput, IParallelTraining parallelTraining, double minDocsPercentageForCategoricalSplit, - Bundle bundling, int minDocsForCategoricalSplit, double bias) + Bundle bundling, int minDocsForCategoricalSplit, double bias, IHost host) : base(trainData, numLeaves) { MinDocsInLeaf = minDocsInLeaf; @@ -135,6 +138,7 @@ public LeastSquaresRegressionTreeLearner(Dataset trainData, int numLeaves, int m MinDocsForCategoricalSplit = minDocsForCategoricalSplit; Bundling = bundling; Bias = bias; + Host = host; _calculateLeafSplitCandidates = ThreadTaskManager.MakeTask( FindBestThresholdForFlockThreadWorker, TrainData.NumFlocks); @@ -148,6 +152,7 @@ public LeastSquaresRegressionTreeLearner(Dataset trainData, int numLeaves, int m histogramPool[i] = new SufficientStatsBase[TrainData.NumFlocks]; for (int j = 0; j < TrainData.NumFlocks; j++) { + Host.CheckAlive(); var ss = histogramPool[i][j] = TrainData.Flocks[j].CreateSufficientStats(HasWeights); _sizeOfReservedMemory += ss.SizeInBytes(); } @@ -498,6 +503,7 @@ protected virtual void SetBestFeatureForLeaf(LeafSplitCandidates leafSplitCandid /// private void FindBestThresholdForFlockThreadWorker(int flock) { + Host.CheckAlive(); int featureMin = TrainData.FlockToFirstFeature(flock); int featureLim = featureMin + TrainData.Flocks[flock].Count; // Check if any feature is active. From 98f051a61c570f57b0ff1772067f359082526fd0 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 19 Mar 2019 17:41:26 -0700 Subject: [PATCH 2/5] undo temp changes. --- .../Dynamic/Trainers/Regression/FastTree.cs | 2 +- docs/samples/Microsoft.ML.Samples/Program.cs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs index 5de6624a7e..082bc340f3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/FastTree.cs @@ -17,7 +17,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training examples. - var examples = GenerateRandomDataPoints(100000); + var examples = GenerateRandomDataPoints(1000); // Convert the examples list to an IDataView object, which is consumable by ML.NET API. var trainingData = mlContext.Data.LoadFromEnumerable(examples); diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 5fb245bd16..ef67739045 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -1,5 +1,4 @@ using Microsoft.ML.Samples.Dynamic; -using Microsoft.ML.Samples.Dynamic.Trainers.Regression; namespace Microsoft.ML.Samples { @@ -7,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - FastTree.Example(); + ReplaceMissingValues.Example(); } } } From a452597931b60ea2ba198e416bf6dd2df2c739b9 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Tue, 19 Mar 2019 18:22:38 -0700 Subject: [PATCH 3/5] build break. --- src/Microsoft.ML.FastTree/RandomForest.cs | 3 ++- .../TreeLearners/FastForestLeastSquaresTreeLearner.cs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.FastTree/RandomForest.cs b/src/Microsoft.ML.FastTree/RandomForest.cs index d4ddc3b1a9..029f0bf5f9 100644 --- a/src/Microsoft.ML.FastTree/RandomForest.cs +++ b/src/Microsoft.ML.FastTree/RandomForest.cs @@ -68,7 +68,8 @@ private protected override TreeLearner ConstructTreeLearner(IChannel ch) FastTreeTrainerOptions.HistogramPoolSize, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.FeatureFractionPerSplit, FastTreeTrainerOptions.AllowEmptyTrees, FastTreeTrainerOptions.GainConfidenceLevel, FastTreeTrainerOptions.MaximumCategoricalGroupCountPerNode, FastTreeTrainerOptions.MaximumCategoricalSplitPointCount, _quantileEnabled, FastTreeTrainerOptions.NumberOfQuantileSamples, ParallelTraining, - FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, FastTreeTrainerOptions.Bias); + FastTreeTrainerOptions.MinimumExampleFractionForCategoricalSplit, FastTreeTrainerOptions.Bundling, FastTreeTrainerOptions.MinimumExamplesForCategoricalSplit, + FastTreeTrainerOptions.Bias, Host); } internal abstract class RandomForestObjectiveFunction : ObjectiveFunctionBase diff --git a/src/Microsoft.ML.FastTree/Training/TreeLearners/FastForestLeastSquaresTreeLearner.cs b/src/Microsoft.ML.FastTree/Training/TreeLearners/FastForestLeastSquaresTreeLearner.cs index 81c4063729..9c12888f23 100644 --- a/src/Microsoft.ML.FastTree/Training/TreeLearners/FastForestLeastSquaresTreeLearner.cs +++ b/src/Microsoft.ML.FastTree/Training/TreeLearners/FastForestLeastSquaresTreeLearner.cs @@ -15,10 +15,10 @@ internal class RandomForestLeastSquaresTreeLearner : LeastSquaresRegressionTreeL public RandomForestLeastSquaresTreeLearner(Dataset trainData, int numLeaves, int minDocsInLeaf, Double entropyCoefficient, Double featureFirstUsePenalty, Double featureReusePenalty, Double softmaxTemperature, int histogramPoolSize, int randomSeed, Double splitFraction, bool allowEmptyTrees, Double gainConfidenceLevel, int maxCategoricalGroupsPerNode, int maxCategoricalSplitPointsPerNode, bool quantileEnabled, int quantileSampleCount, IParallelTraining parallelTraining, - double minDocsPercentageForCategoricalSplit, Bundle bundling, int minDocsForCategoricalSplit, double bias) + double minDocsPercentageForCategoricalSplit, Bundle bundling, int minDocsForCategoricalSplit, double bias, IHost host) : base(trainData, numLeaves, minDocsInLeaf, entropyCoefficient, featureFirstUsePenalty, featureReusePenalty, softmaxTemperature, histogramPoolSize, randomSeed, splitFraction, false, allowEmptyTrees, gainConfidenceLevel, maxCategoricalGroupsPerNode, maxCategoricalSplitPointsPerNode, -1, parallelTraining, - minDocsPercentageForCategoricalSplit, bundling, minDocsForCategoricalSplit, bias) + minDocsPercentageForCategoricalSplit, bundling, minDocsForCategoricalSplit, bias, host) { _quantileSampleCount = quantileSampleCount; _quantileEnabled = quantileEnabled; From 1109882c8ec5ac81958b5b9a1507046e10710ba1 Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Wed, 20 Mar 2019 14:03:00 -0700 Subject: [PATCH 4/5] Add checkpoint in FindBestThresholdFromHistogram(). --- .../Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs b/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs index 3221021f51..7853ee88bc 100644 --- a/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs +++ b/src/Microsoft.ML.FastTree/Training/TreeLearners/LeastSquaresRegressionTreeLearner.cs @@ -655,6 +655,8 @@ public double CalculateSplittedLeafOutput(int count, double sumTargets, double s protected virtual void FindBestThresholdFromHistogram(SufficientStatsBase histogram, LeafSplitCandidates leafSplitCandidates, int flock) { + Host.CheckAlive(); + // Cache histograms for the parallel interface. int featureMin = TrainData.FlockToFirstFeature(flock); int featureLim = featureMin + TrainData.Flocks[flock].Count; From 67b686463c22aa69a2b8555b111482d915de0a4a Mon Sep 17 00:00:00 2001 From: Zeeshan Siddiqui Date: Fri, 22 Mar 2019 11:44:42 -0700 Subject: [PATCH 5/5] Add checkpoint for disk transpose. --- src/Microsoft.ML.FastTree/FastTree.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs index a83fdab043..cdd6084a5b 100644 --- a/src/Microsoft.ML.FastTree/FastTree.cs +++ b/src/Microsoft.ML.FastTree/FastTree.cs @@ -1334,6 +1334,7 @@ private ValueMapper, VBuffer> GetCopier(DataViewType ite private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxBins, IParallelTraining parallelTraining) { + Host.CheckAlive(); Host.AssertValue(examples); Host.Assert(examples.Schema.Feature.HasValue); @@ -1414,6 +1415,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, features.Length)); while (cursor.MoveNext()) { + Host.CheckAlive(); iFeature = cursor.SlotIndex; if (!localConstructBinFeatures[iFeature]) continue; @@ -1489,6 +1491,8 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB int catRangeIndex = 0; for (iFeature = 0; iFeature < NumFeatures;) { + Host.CheckAlive(); + if (catRangeIndex < CategoricalFeatureIndices.Length && CategoricalFeatureIndices[catRangeIndex] == iFeature) { @@ -1565,6 +1569,7 @@ private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxB { for (int i = 0; i < NumFeatures; i++) { + Host.CheckAlive(); GetFeatureValues(cursor, i, getter, ref temp, ref doubleTemp, copier); double[] upperBounds = BinUpperBounds[i]; Host.AssertValue(upperBounds);