From 17026a464a5652a6cca7fa1c8993a059492e11e6 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Fri, 29 Jun 2018 23:17:23 +0000 Subject: [PATCH 01/10] Adding arguments to PipelineSweep Macro --- .../AutoInference.cs | 28 +- .../AutoMlEngines/DefaultsEngine.cs | 11 +- .../AutoMlEngines/RocketEngine.cs | 34 ++- .../AutoMlEngines/UniformRandomEngine.cs | 9 +- .../AutoMlUtils.cs | 10 +- .../Interfaces/IPipelineOptimizer.cs | 6 +- .../Macros/PipelineSweeperMacro.cs | 94 ++++++- .../PurposeInference.cs | 14 +- .../TransformInference.cs | 10 +- src/Microsoft.ML/CSharpApi.cs | 45 ++++ .../Common/EntryPoints/core_manifest.json | 108 ++++++++ .../TestAutoInference.cs | 242 ++++++++++++++++++ 12 files changed, 574 insertions(+), 37 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 2c94c58348..520224fda5 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -208,6 +208,7 @@ public sealed class AutoMlMlState : IMlState private TransformInference.SuggestedTransform[] _availableTransforms; private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners; private DependencyMap _dependencyMapping; + private Dictionary _columnPurpose; public IPipelineOptimizer AutoMlEngine { get; set; } public PipelinePattern[] BatchCandidates { get; set; } public SupportedMetric Metric { get; } @@ -370,7 +371,7 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T TransformInference.SuggestedTransform[] existingTransforms = null) { // Infer transforms using experts - var levelTransforms = TransformInference.InferTransforms(_env, data, args); + var levelTransforms = TransformInference.InferTransforms(_env, data, args, this._columnPurpose); // Retain only those transforms inferred which were also passed in. if (existingTransforms != null) @@ -378,11 +379,13 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T return levelTransforms; } - public void InferSearchSpace(int numTransformLevels) + public void InferSearchSpace(int numTransformLevels, Dictionary columnPurpose = null) { var learners = RecipeInference.AllowedLearners(_env, TrainerKind).ToArray(); if (_requestedLearners != null && _requestedLearners.Length > 0) learners = learners.Where(l => _requestedLearners.Contains(l.LearnerName)).ToArray(); + + this._columnPurpose = columnPurpose; ComputeSearchSpace(numTransformLevels, learners, (b, c) => InferAndFilter(b, c)); } @@ -536,7 +539,26 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) var currentBatchSize = numberOfCandidates; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), numberOfCandidates); - BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize); + BatchCandidates = AutoMlEngine.GetNextCandidates( + _sortedSampledElements.Select(kvp => kvp.Value), + currentBatchSize, + this._columnPurpose); + + var h = _env.Register("AutoMlMlState"); + using (var ch = h.Start("GetNextCandidates")) + { + foreach (var pipeline in BatchCandidates) + { + ch.Info("AutoInference Suggested Transforms."); + int transformK = 0; + foreach (var transform in pipeline.Transforms) + { + transformK += 1; + ch.Info($"Transform {transformK} : {transform.Transform.ToString()}"); + } + } + } + return BatchCandidates; } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs index 1d106e4cef..a9e6a58bc2 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs @@ -33,7 +33,8 @@ public DefaultsEngine(IHostEnvironment env, Arguments args) _currentLearnerIndex = 0; } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, + Dictionary columnPurpose = null) { var candidates = new List(); @@ -53,7 +54,8 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable do { // Make sure transforms set is valid. Repeat until passes verifier. - pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask), learner, "", Env); + pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask, columnPurpose), + learner, "", Env); valid = PipelineVerifier(pipeline, transformsBitMask); count++; } while (!valid && count <= 1000); @@ -69,7 +71,8 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable return candidates.ToArray(); } - private TransformInference.SuggestedTransform[] SampleTransforms(out long transformsBitMask) + private TransformInference.SuggestedTransform[] SampleTransforms(out long transformsBitMask, + Dictionary columnPurpose = null) { // For now, return all transforms. var sampledTransforms = AvailableTransforms.ToList(); @@ -77,7 +80,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(out long transf // Add final features concat transform. sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, columnPurpose)); return sampledTransforms.ToArray(); } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index 80388eb53e..dd708a11ba 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -129,7 +129,8 @@ private void SampleHyperparameters(RecipeInference.SuggestedRecipe.SuggestedLear } private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference.SuggestedRecipe.SuggestedLearner learner, - PipelinePattern[] history, out long transformsBitMask, bool uniformRandomSampling = false) + PipelinePattern[] history, out long transformsBitMask, bool uniformRandomSampling = false, + Dictionary columnPurpose = null) { var sampledTransforms = new List( @@ -187,7 +188,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference // cause an error in verification, since it isn't included in the original // dependency mapping (i.e., its level isn't in the dictionary). sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, columnPurpose)); transformsBitMask = mask; return sampledTransforms.ToArray(); @@ -202,7 +203,8 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume .Select(t=>AvailableLearners[t.Index]).ToArray(); } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, + Dictionary columnPurpose = null) { var prevCandidates = history.ToArray(); @@ -220,10 +222,10 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // number of candidates, using second stage logic. UpdateLearners(GetTopLearners(prevCandidates)); _currentStage++; - return GetNextCandidates(prevCandidates, numCandidates); + return GetNextCandidates(prevCandidates, numCandidates, columnPurpose); } else - return GetInitialPipelines(prevCandidates, remainingNum); + return GetInitialPipelines(prevCandidates, remainingNum, columnPurpose); case (int)Stages.Second: // Second stage: Using top k learners, try random transform configurations. var candidates = new List(); @@ -233,7 +235,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // Get second stage candidates. if (numSecondStageCandidates > 0) - candidates.AddRange(NextCandidates(prevCandidates, numSecondStageCandidates, true, true)); + candidates.AddRange(NextCandidates(prevCandidates, numSecondStageCandidates, true, true, columnPurpose)); // Update stage when no more second stage trials to sample. if (_remainingSecondStageTrials < 1) @@ -242,22 +244,25 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // If the number of requested candidates is smaller than remaining second stage candidates, // draw candidates from remaining pool. if (numThirdStageCandidates > 0) - candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates)); + candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates, false, false, columnPurpose)); return candidates.ToArray(); default: // Sample transforms according to weights and use hyperparameter optimization method. // Third stage samples hyperparameters uniform randomly in KDO, fourth and above do not. - return NextCandidates(prevCandidates, numCandidates); + return NextCandidates(prevCandidates, numCandidates, false, false, columnPurpose); } } - private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) => - _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)] - .GetNextCandidates(history, numCandidates); + private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates, Dictionary columnPurpose) + { + var engine = _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)]; + return engine.GetNextCandidates(history, numCandidates, columnPurpose); + } private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates, - bool defaultHyperParams = false, bool uniformRandomTransforms = false) + bool defaultHyperParams = false, bool uniformRandomTransforms = false, + Dictionary columnPurpose = null) { const int maxNumberAttempts = 10; double[] learnerWeights = LearnerHistoryToWeights(history, IsMaximizingMetric); @@ -294,8 +299,9 @@ private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandi do { // Make sure transforms set is valid and have not seen pipeline before. // Repeat until passes or runs out of chances. - pipeline = new PipelinePattern(SampleTransforms(learner, history, - out var transformsBitMask, uniformRandomTransforms), learner, "", Env); + pipeline = new PipelinePattern( + SampleTransforms(learner, history, out var transformsBitMask, uniformRandomTransforms, columnPurpose), + learner, "", Env); hashKey = GetHashKey(transformsBitMask, learner); valid = PipelineVerifier(pipeline, transformsBitMask) && !VisitedPipelines.Contains(hashKey); count++; diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs index 9f304312c2..6a7a64a606 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs @@ -30,12 +30,13 @@ public UniformRandomEngine(IHostEnvironment env) : base(env, env.Register("UniformRandomEngine(AutoML)")) {} - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, + Dictionary columnPurpose = null) { - return GetRandomPipelines(numberOfCandidates); + return GetRandomPipelines(numberOfCandidates, columnPurpose); } - private PipelinePattern[] GetRandomPipelines(int numOfPipelines) + private PipelinePattern[] GetRandomPipelines(int numOfPipelines, Dictionary columnPurpose = null) { Host.Check(AvailableLearners.All(l => l.PipelineNode != null)); Host.Check(AvailableTransforms.All(t => t.PipelineNode != null)); @@ -66,7 +67,7 @@ private PipelinePattern[] GetRandomPipelines(int numOfPipelines) // Always include features concat transform selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms)); + DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, columnPurpose)); // Compute hash key for checking if we've already seen this pipeline. // However, if we keep missing, don't want to get stuck in infinite loop. diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs index 6aec714618..7876f5142e 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs @@ -257,7 +257,8 @@ public static long TransformsToBitmask(TransformInference.SuggestedTransform[] t /// (In other words, if there would be nothing for that concatenate transform to do.) /// private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, - IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset) + IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, + Dictionary columnPurpose = null) { var finalArgs = new TransformInference.Arguments { @@ -266,7 +267,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo ExcludedColumnIndices = excludedColumnIndices }; - var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs); + var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs, columnPurpose); for (int i = 0; i < featuresConcatTransforms.Length; i++) { @@ -282,7 +283,8 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo /// public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView data, AutoInference.DependencyMap dependencyMapping, TransformInference.SuggestedTransform[] selectedTransforms, - TransformInference.SuggestedTransform[] allTransforms) + TransformInference.SuggestedTransform[] allTransforms, + Dictionary columnPurpose = null) { int level = 1; int atomicGroupLimit = 0; @@ -292,7 +294,7 @@ public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHos atomicGroupLimit = allTransforms.Max(t => t.AtomicGroupId) + 1; } var excludedColumnIndices = GetExcludedColumnIndices(selectedTransforms, data, dependencyMapping); - return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit); + return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit, columnPurpose); } public static IDataView ApplyTransformSet(IHostEnvironment env, IDataView data, TransformInference.SuggestedTransform[] transforms) diff --git a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs index 84603fc017..2a754b2e96 100644 --- a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs +++ b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs @@ -21,7 +21,8 @@ namespace Microsoft.ML.Runtime.PipelineInference /// public interface IPipelineOptimizer { - PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates); + PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, + Dictionary columnPurpose = null); void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, @@ -60,7 +61,8 @@ protected PipelineOptimizerBase(IHostEnvironment env, IHost host) ProbUtils = new SweeperProbabilityUtils(host); } - public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates); + public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, + Dictionary columnPurpose = null); public virtual void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 06c260a054..f158a6eeb2 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -36,6 +36,33 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Output datasets from previous iteration of sweep.", SortOrder = 7, Hide = true)] public IDataView[] CandidateOutputs; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Ignore'", SortOrder = 8, Hide = true)] + public string[] IgnoreColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 9, Hide = true)] + public string[] NameColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 10, Hide = true)] + public string[] LabelColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 11, Hide = true)] + public string[] NumericFeatureColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 12, Hide = true)] + public string[] CategoricalFeatureColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 13, Hide = true)] + public string[] TextFeatureColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 14, Hide = true)] + public string[] WeightColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 15, Hide = true)] + public string[] GroupIdColumn; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 16, Hide = true)] + public string[] ImagePathColumn; } public sealed class Output @@ -88,6 +115,68 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input) return new Output { Results = outputView, State = autoMlState }; } + private static Dictionary SetColumnPurpose(Arguments input) + { + var columnPurpose = new Dictionary(); + if (input.IgnoreColumn != null) + { + foreach (var colName in input.IgnoreColumn) + { + columnPurpose.Add(colName, ColumnPurpose.Ignore); + } + } + if (input.NameColumn != null) + { + foreach (var colName in input.NameColumn) + { + columnPurpose.Add(colName, ColumnPurpose.Name); + } + } + if (input.LabelColumn != null) + { + foreach (var colName in input.LabelColumn) + { + columnPurpose.Add(colName, ColumnPurpose.Label); + } + } + if (input.NumericFeatureColumn != null) + { + foreach (var colName in input.NumericFeatureColumn) + { + columnPurpose.Add(colName, ColumnPurpose.NumericFeature); + } + } + if (input.CategoricalFeatureColumn != null) + { + foreach (var colName in input.CategoricalFeatureColumn) + { + columnPurpose.Add(colName, ColumnPurpose.CategoricalFeature); + } + } + if (input.TextFeatureColumn != null) + { + foreach (var colName in input.TextFeatureColumn) + { + columnPurpose.Add(colName, ColumnPurpose.TextFeature); + } + } + if (input.GroupIdColumn != null) + { + foreach (var colName in input.GroupIdColumn) + { + columnPurpose.Add(colName, ColumnPurpose.GroupId); + } + } + if (input.ImagePathColumn != null) + { + foreach (var colName in input.ImagePathColumn) + { + columnPurpose.Add(colName, ColumnPurpose.ImagePath); + } + } + return columnPurpose; + } + [TlcModule.EntryPoint(Desc = "AutoML pipeline sweeping optimzation macro.", Name = "Models.PipelineSweeper")] public static CommonOutputs.MacroOutput PipelineSweep( IHostEnvironment env, @@ -98,6 +187,9 @@ public static CommonOutputs.MacroOutput PipelineSweep( "Must have a valid AutoML State, or pass arguments to create one."); env.Check(input.BatchSize > 0, "Batch size must be > 0."); + // Get the user-defined column purposes (if any) + var columnPurpose = SetColumnPurpose(input); + // If no current state, create object and set data. if (input.State == null) { @@ -133,7 +225,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( // Make sure search space is defined. If not, infer, // with default number of transform levels. if (!autoMlState.IsSearchSpaceDefined()) - autoMlState.InferSearchSpace(numTransformLevels: 1); + autoMlState.InferSearchSpace(numTransformLevels: 1, columnPurpose); // Extract performance summaries and assign to previous candidate pipelines. foreach (var pipeline in autoMlState.BatchCandidates) diff --git a/src/Microsoft.ML.PipelineInference/PurposeInference.cs b/src/Microsoft.ML.PipelineInference/PurposeInference.cs index 7858f1d12b..e34f6bb808 100644 --- a/src/Microsoft.ML.PipelineInference/PurposeInference.cs +++ b/src/Microsoft.ML.PipelineInference/PurposeInference.cs @@ -318,8 +318,10 @@ private static IEnumerable GetExperts() /// The data to use for inference. /// Indices of columns that we're interested in. /// Additional arguments to inference. + /// (Optional) User defined mapping of Column to Purpose. /// The result includes the array of auto-detected column purposes. - public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data, IEnumerable columnIndices, Arguments args) + public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data, IEnumerable columnIndices, Arguments args, + Dictionary columnPurpose = null) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("InferPurposes"); @@ -333,6 +335,16 @@ public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToArray(); data = takenData; + // Instantiate with purpose provided in columnPurpose (if provided) + if (columnPurpose != null && columnPurpose.Count > 0) + { + foreach (var col in cols) + { + if (columnPurpose.ContainsKey(col.ColumnName)) + col.SuggestedPurpose = columnPurpose[col.ColumnName]; + } + } + foreach (var expert in GetExperts()) { using (var expertChannel = host.Start(expert.GetType().ToString())) diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs index 6390139030..98fb76c7ac 100644 --- a/src/Microsoft.ML.PipelineInference/TransformInference.cs +++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs @@ -1559,7 +1559,8 @@ public static InferenceResult InferTransforms(IHostEnvironment env, IDataView da } } - public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args) + public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args, + Dictionary columnPurpose = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferTransforms"); @@ -1576,7 +1577,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi // Infer column purposes from data sample. var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, dataSample.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs); + var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs, columnPurpose); var purposes = piResult.Columns; // Infer transforms @@ -1595,7 +1596,8 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi .Contains(t.AtomicGroupId)).ToArray(); } - public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args) + public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args, + Dictionary columnPurpose = null) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferConcatNumericFeatures"); @@ -1608,7 +1610,7 @@ public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment e // Infer column purposes from data sample. var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, data.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs); + var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs, columnPurpose); var purposes = piResult.Columns; var cols = purposes.Where(x => !data.Schema.IsHidden(x.ColumnIndex) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 12402b3993..5e5a837e58 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3410,6 +3410,51 @@ public sealed partial class PipelineSweeper /// public ArrayVar CandidateOutputs { get; set; } = new ArrayVar(); + /// + /// Column(s) to use as purpose 'Ignore' + /// + public string[] IgnoreColumn { get; set; } + + /// + /// Column(s) to use as purpose 'Name' + /// + public string[] NameColumn { get; set; } + + /// + /// Column(s) to use as purpose 'Label' + /// + public string[] LabelColumn { get; set; } + + /// + /// Column(s) to use as purpose 'NumericFeature' + /// + public string[] NumericFeatureColumn { get; set; } + + /// + /// Column(s) to use as purpose 'CategoricalFeature' + /// + public string[] CategoricalFeatureColumn { get; set; } + + /// + /// Column(s) to use as purpose 'TextFeature' + /// + public string[] TextFeatureColumn { get; set; } + + /// + /// Column(s) to use as purpose 'Weight' + /// + public string[] WeightColumn { get; set; } + + /// + /// Column(s) to use as purpose 'GroupId' + /// + public string[] GroupIdColumn { get; set; } + + /// + /// Column(s) to use as purpose 'ImagePath' + /// + public string[] ImagePathColumn { get; set; } + public sealed class Output { diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 0acb5971b0..8bb3fce647 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -2727,6 +2727,114 @@ "SortOrder": 7.0, "IsNullable": false, "Default": null + }, + { + "Name": "IgnoreColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'Ignore'", + "Required": false, + "SortOrder": 8.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "NameColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'Name'", + "Required": false, + "SortOrder": 9.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LabelColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'Label'", + "Required": false, + "SortOrder": 10.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "NumericFeatureColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'NumericFeature'", + "Required": false, + "SortOrder": 11.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "CategoricalFeatureColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'CategoricalFeature'", + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "TextFeatureColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'TextFeature'", + "Required": false, + "SortOrder": 13.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "WeightColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'Weight'", + "Required": false, + "SortOrder": 14.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "GroupIdColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'GroupId'", + "Required": false, + "SortOrder": 15.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "ImagePathColumn", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column(s) to use as purpose 'ImagePath'", + "Required": false, + "SortOrder": 16.0, + "IsNullable": false, + "Default": null } ], "Outputs": [ diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index 77697eea9f..ba7ed5098a 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -272,6 +272,248 @@ public void EntryPointPipelineSweep() Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); } + [Fact] + [TestCategory("EntryPoints")] + public void EntryPointPipelineSweepColumnPurposeDefaults() + { + // Get datasets + var pathData = GetDataPath("adult.train"); + var pathDataTest = GetDataPath("adult.test"); + const int numOfSampleRows = 1000; + int numIterations = 4; + const string schema = + "sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'IgnoreColumn': ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'ethnicity', 'sex'], + 'NumericFeatureColumn': ['age'], + 'LabelColumn': ['IsOver50K'], + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Defaults' + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 4 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer' + } + }, + 'BatchSize': 2 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); + } + + [Fact] + [TestCategory("EntryPoints")] + public void EntryPointPipelineSweepColumnPurposeRocket() + { + // Get datasets + var pathData = GetDataPath("adult.train"); + var pathDataTest = GetDataPath("adult.test"); + const int numOfSampleRows = 1000; + int numIterations = 4; + const string schema = + "sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], + 'LabelColumn': ['IsOver50K'], + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'Rocket' + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 4 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer' + } + }, + 'BatchSize': 2 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); + } + + [Fact] + [TestCategory("EntryPoints")] + public void EntryPointPipelineSweepColumnPurposeUniformRandom() + { + // Get datasets + var pathData = GetDataPath("adult.train"); + var pathDataTest = GetDataPath("adult.test"); + const int numOfSampleRows = 1000; + int numIterations = 4; + const string schema = +"sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + +"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; + + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + // Define entrypoint graph + string inputGraph = @" + { + 'Nodes': [ + { + 'Name': 'Models.PipelineSweeper', + 'Inputs': { + 'TrainingData': '$TrainingData', + 'TestingData': '$TestingData', + 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], + 'LabelColumn': ['IsOver50K'], + 'StateArguments': { + 'Name': 'AutoMlState', + 'Settings': { + 'Metric': 'Auc', + 'Engine': { + 'Name': 'UniformRandom' + }, + 'TerminatorArgs': { + 'Name': 'IterationLimited', + 'Settings': { + 'FinalHistoryLength': 4 + } + }, + 'TrainerKind': 'SignatureBinaryClassifierTrainer' + } + }, + 'BatchSize': 2 + }, + 'Outputs': { + 'State': '$StateOut', + 'Results': '$ResultsOut' + } + }, + ] + }"; + + JObject graph = JObject.Parse(inputGraph); + var catalog = ModuleCatalog.CreateInstance(Env); + + var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + runner.SetInput("TrainingData", datasetTrain); + runner.SetInput("TestingData", datasetTest); + runner.RunAll(); + + var autoMlState = runner.GetOutput("StateOut"); + Assert.NotNull(autoMlState); + var allPipelines = autoMlState.GetAllEvaluatedPipelines(); + var bestPipeline = autoMlState.GetBestPipeline(); + Assert.Equal(allPipelines.Length, numIterations); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + + var results = runner.GetOutput("ResultsOut"); + Assert.NotNull(results); + var rows = PipelinePattern.ExtractResults(Env, results, + "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); + } + [Fact] public void TestRocketPipelineEngine() { From c76181cbb501fa26921adbda647bcf9f236514c7 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Sat, 30 Jun 2018 01:03:29 +0000 Subject: [PATCH 02/10] updating the unit tests --- test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index ba7ed5098a..ac2ffdd5a3 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -301,8 +301,7 @@ public void EntryPointPipelineSweepColumnPurposeDefaults() 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', - 'IgnoreColumn': ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'ethnicity', 'sex'], - 'NumericFeatureColumn': ['age'], + 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], 'LabelColumn': ['IsOver50K'], 'StateArguments': { 'Name': 'AutoMlState', @@ -343,7 +342,7 @@ public void EntryPointPipelineSweepColumnPurposeDefaults() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.87); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); @@ -423,7 +422,7 @@ public void EntryPointPipelineSweepColumnPurposeRocket() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.87); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); @@ -504,7 +503,7 @@ public void EntryPointPipelineSweepColumnPurposeUniformRandom() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.1); + Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.86); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); From 28371fe26346dda6708f816e2102425aa08fdf5f Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Sun, 1 Jul 2018 02:15:59 +0000 Subject: [PATCH 03/10] taking care of review comments; adding validations for Label, Weight, GroupId and Name columns --- .../Macros/PipelineSweeperMacro.cs | 27 +++++++++---------- .../TestAutoInference.cs | 18 ++++++++++--- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index f158a6eeb2..3e4e85acd8 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -115,7 +115,7 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input) return new Output { Results = outputView, State = autoMlState }; } - private static Dictionary SetColumnPurpose(Arguments input) + private static Dictionary SetColumnPurpose(IHostEnvironment env, Arguments input) { var columnPurpose = new Dictionary(); if (input.IgnoreColumn != null) @@ -127,17 +127,13 @@ private static Dictionary SetColumnPurpose(Arguments inpu } if (input.NameColumn != null) { - foreach (var colName in input.NameColumn) - { - columnPurpose.Add(colName, ColumnPurpose.Name); - } + env.Check(input.NameColumn.Length == 1, "NameColumn expected one column name to be specified."); + columnPurpose.Add(input.NameColumn[0], ColumnPurpose.Name); } if (input.LabelColumn != null) { - foreach (var colName in input.LabelColumn) - { - columnPurpose.Add(colName, ColumnPurpose.Label); - } + env.Check(input.LabelColumn.Length == 1, "LabelColumn expected one column name to be specified."); + columnPurpose.Add(input.LabelColumn[0], ColumnPurpose.Label); } if (input.NumericFeatureColumn != null) { @@ -160,12 +156,15 @@ private static Dictionary SetColumnPurpose(Arguments inpu columnPurpose.Add(colName, ColumnPurpose.TextFeature); } } + if (input.WeightColumn != null) + { + env.Check(input.WeightColumn.Length == 1, "WeightColumn expected one column name to be specified."); + columnPurpose.Add(input.WeightColumn[0], ColumnPurpose.Weight); + } if (input.GroupIdColumn != null) { - foreach (var colName in input.GroupIdColumn) - { - columnPurpose.Add(colName, ColumnPurpose.GroupId); - } + env.Check(input.GroupIdColumn.Length == 1, "GroupIdColumn expected one column name to be specified."); + columnPurpose.Add(input.GroupIdColumn[0], ColumnPurpose.GroupId); } if (input.ImagePathColumn != null) { @@ -188,7 +187,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( env.Check(input.BatchSize > 0, "Batch size must be > 0."); // Get the user-defined column purposes (if any) - var columnPurpose = SetColumnPurpose(input); + var columnPurpose = SetColumnPurpose(env, input); // If no current state, create object and set data. if (input.State == null) diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index ac2ffdd5a3..a4267c40ba 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -342,7 +342,11 @@ public void EntryPointPipelineSweepColumnPurposeDefaults() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.87); + + var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; + var testAuc = bestPipeline.PerformanceSummary.MetricValue; + Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); + Assert.True((0.87 < testAuc) && (testAuc < 0.88)); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); @@ -422,7 +426,11 @@ public void EntryPointPipelineSweepColumnPurposeRocket() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.87); + + var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; + var testAuc = bestPipeline.PerformanceSummary.MetricValue; + Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); + Assert.True((0.87 < testAuc) && (testAuc < 0.88)); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); @@ -503,7 +511,11 @@ public void EntryPointPipelineSweepColumnPurposeUniformRandom() var allPipelines = autoMlState.GetAllEvaluatedPipelines(); var bestPipeline = autoMlState.GetBestPipeline(); Assert.Equal(allPipelines.Length, numIterations); - Assert.True(bestPipeline.PerformanceSummary.MetricValue > 0.86); + + var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; + var testAuc = bestPipeline.PerformanceSummary.MetricValue; + Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); + Assert.True((0.86 < testAuc) && (testAuc < 0.87)); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); From 310b395ee1df67a0931266313fff0ac27d51cabd Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Mon, 2 Jul 2018 22:58:41 +0000 Subject: [PATCH 04/10] taking care of some review comments --- .../AutoInference.cs | 16 ++--- .../AutoMlEngines/DefaultsEngine.cs | 8 +-- .../AutoMlEngines/RocketEngine.cs | 21 +++--- .../AutoMlEngines/UniformRandomEngine.cs | 7 +- .../Interfaces/IPipelineOptimizer.cs | 1 + .../Macros/PipelineSweeperMacro.cs | 66 +++++++++---------- src/Microsoft.ML/CSharpApi.cs | 18 ++--- .../Common/EntryPoints/core_manifest.json | 18 ++--- .../TestAutoInference.cs | 12 ++-- 9 files changed, 84 insertions(+), 83 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 520224fda5..6c45c00ce8 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -205,6 +205,7 @@ public sealed class AutoMlMlState : IMlState private IDataView _transformedData; private ITerminator _terminator; private string[] _requestedLearners; + private int _pipelineId; private TransformInference.SuggestedTransform[] _availableTransforms; private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners; private DependencyMap _dependencyMapping; @@ -371,7 +372,7 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T TransformInference.SuggestedTransform[] existingTransforms = null) { // Infer transforms using experts - var levelTransforms = TransformInference.InferTransforms(_env, data, args, this._columnPurpose); + var levelTransforms = TransformInference.InferTransforms(_env, data, args, _columnPurpose); // Retain only those transforms inferred which were also passed in. if (existingTransforms != null) @@ -385,7 +386,7 @@ public void InferSearchSpace(int numTransformLevels, Dictionary 0) learners = learners.Where(l => _requestedLearners.Contains(l.LearnerName)).ToArray(); - this._columnPurpose = columnPurpose; + _columnPurpose = columnPurpose; ComputeSearchSpace(numTransformLevels, learners, (b, c) => InferAndFilter(b, c)); } @@ -542,20 +543,19 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) BatchCandidates = AutoMlEngine.GetNextCandidates( _sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize, - this._columnPurpose); + _columnPurpose); - var h = _env.Register("AutoMlMlState"); - using (var ch = h.Start("GetNextCandidates")) + using (var ch = _host.Start("Print suggested transforms")) { foreach (var pipeline in BatchCandidates) { - ch.Info("AutoInference Suggested Transforms."); + ch.Info($"AutoInference Pipeline : {_pipelineId++}"); int transformK = 0; foreach (var transform in pipeline.Transforms) { - transformK += 1; - ch.Info($"Transform {transformK} : {transform.Transform.ToString()}"); + ch.Info($"AutoInference Transform {transformK++} : {transform.Transform}"); } + ch.Info($"AutoInference Learner : {pipeline.Learner}"); } } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs index a9e6a58bc2..20fac94889 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs @@ -34,9 +34,10 @@ public DefaultsEngine(IHostEnvironment env, Arguments args) } public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary columnPurpose = null) + Dictionary colPurpose = null) { var candidates = new List(); + columnPurpose = colPurpose; while (candidates.Count < numCandidates) { @@ -54,7 +55,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable do { // Make sure transforms set is valid. Repeat until passes verifier. - pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask, columnPurpose), + pipeline = new PipelinePattern(SampleTransforms(out var transformsBitMask), learner, "", Env); valid = PipelineVerifier(pipeline, transformsBitMask); count++; @@ -71,8 +72,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable return candidates.ToArray(); } - private TransformInference.SuggestedTransform[] SampleTransforms(out long transformsBitMask, - Dictionary columnPurpose = null) + private TransformInference.SuggestedTransform[] SampleTransforms(out long transformsBitMask) { // For now, return all transforms. var sampledTransforms = AvailableTransforms.ToList(); diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index dd708a11ba..e32f7f5fd8 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -129,8 +129,7 @@ private void SampleHyperparameters(RecipeInference.SuggestedRecipe.SuggestedLear } private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference.SuggestedRecipe.SuggestedLearner learner, - PipelinePattern[] history, out long transformsBitMask, bool uniformRandomSampling = false, - Dictionary columnPurpose = null) + PipelinePattern[] history, out long transformsBitMask, bool uniformRandomSampling = false) { var sampledTransforms = new List( @@ -204,9 +203,10 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume } public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary columnPurpose = null) + Dictionary colPurpose = null) { var prevCandidates = history.ToArray(); + columnPurpose = colPurpose; switch (_currentStage) { @@ -225,7 +225,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable return GetNextCandidates(prevCandidates, numCandidates, columnPurpose); } else - return GetInitialPipelines(prevCandidates, remainingNum, columnPurpose); + return GetInitialPipelines(prevCandidates, remainingNum); case (int)Stages.Second: // Second stage: Using top k learners, try random transform configurations. var candidates = new List(); @@ -235,7 +235,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // Get second stage candidates. if (numSecondStageCandidates > 0) - candidates.AddRange(NextCandidates(prevCandidates, numSecondStageCandidates, true, true, columnPurpose)); + candidates.AddRange(NextCandidates(prevCandidates, numSecondStageCandidates, true, true)); // Update stage when no more second stage trials to sample. if (_remainingSecondStageTrials < 1) @@ -244,25 +244,24 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // If the number of requested candidates is smaller than remaining second stage candidates, // draw candidates from remaining pool. if (numThirdStageCandidates > 0) - candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates, false, false, columnPurpose)); + candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates, false, false)); return candidates.ToArray(); default: // Sample transforms according to weights and use hyperparameter optimization method. // Third stage samples hyperparameters uniform randomly in KDO, fourth and above do not. - return NextCandidates(prevCandidates, numCandidates, false, false, columnPurpose); + return NextCandidates(prevCandidates, numCandidates, false, false); } } - private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates, Dictionary columnPurpose) + private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) { var engine = _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)]; return engine.GetNextCandidates(history, numCandidates, columnPurpose); } private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates, - bool defaultHyperParams = false, bool uniformRandomTransforms = false, - Dictionary columnPurpose = null) + bool defaultHyperParams = false, bool uniformRandomTransforms = false) { const int maxNumberAttempts = 10; double[] learnerWeights = LearnerHistoryToWeights(history, IsMaximizingMetric); @@ -300,7 +299,7 @@ private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandi { // Make sure transforms set is valid and have not seen pipeline before. // Repeat until passes or runs out of chances. pipeline = new PipelinePattern( - SampleTransforms(learner, history, out var transformsBitMask, uniformRandomTransforms, columnPurpose), + SampleTransforms(learner, history, out var transformsBitMask, uniformRandomTransforms), learner, "", Env); hashKey = GetHashKey(transformsBitMask, learner); valid = PipelineVerifier(pipeline, transformsBitMask) && !VisitedPipelines.Contains(hashKey); diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs index 6a7a64a606..aebcc1c6a3 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs @@ -31,12 +31,13 @@ public UniformRandomEngine(IHostEnvironment env) {} public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose = null) + Dictionary colPurpose = null) { - return GetRandomPipelines(numberOfCandidates, columnPurpose); + columnPurpose = colPurpose; + return GetRandomPipelines(numberOfCandidates); } - private PipelinePattern[] GetRandomPipelines(int numOfPipelines, Dictionary columnPurpose = null) + private PipelinePattern[] GetRandomPipelines(int numOfPipelines) { Host.Check(AvailableLearners.All(l => l.PipelineNode != null)); Host.Check(AvailableTransforms.All(t => t.PipelineNode != null)); diff --git a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs index 2a754b2e96..a05e9affa8 100644 --- a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs +++ b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs @@ -45,6 +45,7 @@ public abstract class PipelineOptimizerBase : IPipelineOptimizer protected IDataView OriginalData; protected IDataView FullyTransformedData; protected AutoInference.DependencyMap DependencyMapping; + protected Dictionary columnPurpose; protected readonly IHostEnvironment Env; protected readonly IHost Host; protected readonly Dictionary TransformsMaskValidity; diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 3e4e85acd8..05cc914a72 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -38,31 +38,31 @@ public sealed class Arguments public IDataView[] CandidateOutputs; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Ignore'", SortOrder = 8, Hide = true)] - public string[] IgnoreColumn; + public string[] IgnoreColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 9, Hide = true)] - public string[] NameColumn; + public string[] NameColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 10, Hide = true)] - public string[] LabelColumn; + public string[] LabelColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 11, Hide = true)] - public string[] NumericFeatureColumn; + public string[] NumericFeatureColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 12, Hide = true)] - public string[] CategoricalFeatureColumn; + public string[] CategoricalFeatureColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 13, Hide = true)] - public string[] TextFeatureColumn; + public string[] TextFeatureColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 14, Hide = true)] - public string[] WeightColumn; + public string[] WeightColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 15, Hide = true)] - public string[] GroupIdColumn; + public string[] GroupIdColumns; [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 16, Hide = true)] - public string[] ImagePathColumn; + public string[] ImagePathColumns; } public sealed class Output @@ -115,60 +115,60 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input) return new Output { Results = outputView, State = autoMlState }; } - private static Dictionary SetColumnPurpose(IHostEnvironment env, Arguments input) + private static Dictionary GetColumnPurpose(IHostEnvironment env, Arguments input) { var columnPurpose = new Dictionary(); - if (input.IgnoreColumn != null) + if (input.IgnoreColumns != null) { - foreach (var colName in input.IgnoreColumn) + foreach (var colName in input.IgnoreColumns) { columnPurpose.Add(colName, ColumnPurpose.Ignore); } } - if (input.NameColumn != null) + if (input.NameColumns != null) { - env.Check(input.NameColumn.Length == 1, "NameColumn expected one column name to be specified."); - columnPurpose.Add(input.NameColumn[0], ColumnPurpose.Name); + env.Check(input.NameColumns.Length == 1, "NameColumn expected one column name to be specified."); + columnPurpose.Add(input.NameColumns[0], ColumnPurpose.Name); } - if (input.LabelColumn != null) + if (input.LabelColumns != null) { - env.Check(input.LabelColumn.Length == 1, "LabelColumn expected one column name to be specified."); - columnPurpose.Add(input.LabelColumn[0], ColumnPurpose.Label); + env.Check(input.LabelColumns.Length == 1, "LabelColumn expected one column name to be specified."); + columnPurpose.Add(input.LabelColumns[0], ColumnPurpose.Label); } - if (input.NumericFeatureColumn != null) + if (input.NumericFeatureColumns != null) { - foreach (var colName in input.NumericFeatureColumn) + foreach (var colName in input.NumericFeatureColumns) { columnPurpose.Add(colName, ColumnPurpose.NumericFeature); } } - if (input.CategoricalFeatureColumn != null) + if (input.CategoricalFeatureColumns != null) { - foreach (var colName in input.CategoricalFeatureColumn) + foreach (var colName in input.CategoricalFeatureColumns) { columnPurpose.Add(colName, ColumnPurpose.CategoricalFeature); } } - if (input.TextFeatureColumn != null) + if (input.TextFeatureColumns != null) { - foreach (var colName in input.TextFeatureColumn) + foreach (var colName in input.TextFeatureColumns) { columnPurpose.Add(colName, ColumnPurpose.TextFeature); } } - if (input.WeightColumn != null) + if (input.WeightColumns != null) { - env.Check(input.WeightColumn.Length == 1, "WeightColumn expected one column name to be specified."); - columnPurpose.Add(input.WeightColumn[0], ColumnPurpose.Weight); + env.Check(input.WeightColumns.Length == 1, "WeightColumn expected one column name to be specified."); + columnPurpose.Add(input.WeightColumns[0], ColumnPurpose.Weight); } - if (input.GroupIdColumn != null) + if (input.GroupIdColumns != null) { - env.Check(input.GroupIdColumn.Length == 1, "GroupIdColumn expected one column name to be specified."); - columnPurpose.Add(input.GroupIdColumn[0], ColumnPurpose.GroupId); + env.Check(input.GroupIdColumns.Length == 1, "GroupIdColumn expected one column name to be specified."); + columnPurpose.Add(input.GroupIdColumns[0], ColumnPurpose.GroupId); } - if (input.ImagePathColumn != null) + if (input.ImagePathColumns != null) { - foreach (var colName in input.ImagePathColumn) + foreach (var colName in input.ImagePathColumns) { columnPurpose.Add(colName, ColumnPurpose.ImagePath); } @@ -187,7 +187,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( env.Check(input.BatchSize > 0, "Batch size must be > 0."); // Get the user-defined column purposes (if any) - var columnPurpose = SetColumnPurpose(env, input); + var columnPurpose = GetColumnPurpose(env, input); // If no current state, create object and set data. if (input.State == null) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 5e5a837e58..2480f9100b 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3413,47 +3413,47 @@ public sealed partial class PipelineSweeper /// /// Column(s) to use as purpose 'Ignore' /// - public string[] IgnoreColumn { get; set; } + public string[] IgnoreColumns { get; set; } /// /// Column(s) to use as purpose 'Name' /// - public string[] NameColumn { get; set; } + public string[] NameColumns { get; set; } /// /// Column(s) to use as purpose 'Label' /// - public string[] LabelColumn { get; set; } + public string[] LabelColumns { get; set; } /// /// Column(s) to use as purpose 'NumericFeature' /// - public string[] NumericFeatureColumn { get; set; } + public string[] NumericFeatureColumns { get; set; } /// /// Column(s) to use as purpose 'CategoricalFeature' /// - public string[] CategoricalFeatureColumn { get; set; } + public string[] CategoricalFeatureColumns { get; set; } /// /// Column(s) to use as purpose 'TextFeature' /// - public string[] TextFeatureColumn { get; set; } + public string[] TextFeatureColumns { get; set; } /// /// Column(s) to use as purpose 'Weight' /// - public string[] WeightColumn { get; set; } + public string[] WeightColumns { get; set; } /// /// Column(s) to use as purpose 'GroupId' /// - public string[] GroupIdColumn { get; set; } + public string[] GroupIdColumns { get; set; } /// /// Column(s) to use as purpose 'ImagePath' /// - public string[] ImagePathColumn { get; set; } + public string[] ImagePathColumns { get; set; } public sealed class Output diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 8bb3fce647..6a174a936b 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -2729,7 +2729,7 @@ "Default": null }, { - "Name": "IgnoreColumn", + "Name": "IgnoreColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2741,7 +2741,7 @@ "Default": null }, { - "Name": "NameColumn", + "Name": "NameColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2753,7 +2753,7 @@ "Default": null }, { - "Name": "LabelColumn", + "Name": "LabelColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2765,7 +2765,7 @@ "Default": null }, { - "Name": "NumericFeatureColumn", + "Name": "NumericFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2777,7 +2777,7 @@ "Default": null }, { - "Name": "CategoricalFeatureColumn", + "Name": "CategoricalFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2789,7 +2789,7 @@ "Default": null }, { - "Name": "TextFeatureColumn", + "Name": "TextFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2801,7 +2801,7 @@ "Default": null }, { - "Name": "WeightColumn", + "Name": "WeightColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2813,7 +2813,7 @@ "Default": null }, { - "Name": "GroupIdColumn", + "Name": "GroupIdColumns", "Type": { "Kind": "Array", "ItemType": "String" @@ -2825,7 +2825,7 @@ "Default": null }, { - "Name": "ImagePathColumn", + "Name": "ImagePathColumns", "Type": { "Kind": "Array", "ItemType": "String" diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index a4267c40ba..bbb97d33f8 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -301,8 +301,8 @@ public void EntryPointPipelineSweepColumnPurposeDefaults() 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', - 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumn': ['IsOver50K'], + 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], + 'LabelColumns': ['IsOver50K'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { @@ -385,8 +385,8 @@ public void EntryPointPipelineSweepColumnPurposeRocket() 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', - 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumn': ['IsOver50K'], + 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], + 'LabelColumns': ['IsOver50K'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { @@ -470,8 +470,8 @@ public void EntryPointPipelineSweepColumnPurposeUniformRandom() 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', - 'IgnoreColumn': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumn': ['IsOver50K'], + 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], + 'LabelColumns': ['IsOver50K'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { From 4fbaf14c5050252e588381222ce543cf86d6f0c8 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Mon, 2 Jul 2018 23:19:11 +0000 Subject: [PATCH 05/10] some code cleanup --- src/Microsoft.ML.PipelineInference/AutoInference.cs | 2 +- .../AutoMlEngines/RocketEngine.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 6c45c00ce8..08d934b015 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -545,7 +545,7 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) currentBatchSize, _columnPurpose); - using (var ch = _host.Start("Print suggested transforms")) + using (var ch = _host.Start("Suggested Pipeline")) { foreach (var pipeline in BatchCandidates) { diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index e32f7f5fd8..20371bba7c 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -244,13 +244,13 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // If the number of requested candidates is smaller than remaining second stage candidates, // draw candidates from remaining pool. if (numThirdStageCandidates > 0) - candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates, false, false)); + candidates.AddRange(NextCandidates(prevCandidates, numThirdStageCandidates)); return candidates.ToArray(); default: // Sample transforms according to weights and use hyperparameter optimization method. // Third stage samples hyperparameters uniform randomly in KDO, fourth and above do not. - return NextCandidates(prevCandidates, numCandidates, false, false); + return NextCandidates(prevCandidates, numCandidates); } } From 3ca4da9cb747c08a9141545ef859e8c66877330a Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Tue, 3 Jul 2018 22:03:53 +0000 Subject: [PATCH 06/10] addressing PR comments --- .../AutoInference.cs | 2 +- .../AutoMlEngines/DefaultsEngine.cs | 6 +- .../AutoMlEngines/RocketEngine.cs | 8 +- .../AutoMlEngines/UniformRandomEngine.cs | 6 +- .../AutoMlUtils.cs | 4 +- .../Interfaces/IPipelineOptimizer.cs | 6 +- .../Macros/PipelineSweeperMacro.cs | 18 ++--- .../TransformInference.cs | 2 +- .../TestAutoInference.cs | 75 +++++++++++++++++++ 9 files changed, 101 insertions(+), 26 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 08d934b015..776b44fe89 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -315,7 +315,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows) var currentBatchSize = batchSize; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), batchSize); - var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize); + var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize, _columnPurpose); // Break if no candidates returned, means no valid pipeline available. if (candidates.Length == 0) diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs index 20fac94889..0396f7f971 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs @@ -34,10 +34,10 @@ public DefaultsEngine(IHostEnvironment env, Arguments args) } public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary colPurpose = null) + Dictionary columnPurpose) { var candidates = new List(); - columnPurpose = colPurpose; + ColumnPurpose = columnPurpose; while (candidates.Count < numCandidates) { @@ -80,7 +80,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(out long transf // Add final features concat transform. sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, columnPurpose)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); return sampledTransforms.ToArray(); } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index 20371bba7c..af1f988140 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -187,7 +187,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference // cause an error in verification, since it isn't included in the original // dependency mapping (i.e., its level isn't in the dictionary). sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, columnPurpose)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); transformsBitMask = mask; return sampledTransforms.ToArray(); @@ -203,10 +203,10 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume } public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary colPurpose = null) + Dictionary columnPurpose) { var prevCandidates = history.ToArray(); - columnPurpose = colPurpose; + ColumnPurpose = columnPurpose; switch (_currentStage) { @@ -257,7 +257,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) { var engine = _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)]; - return engine.GetNextCandidates(history, numCandidates, columnPurpose); + return engine.GetNextCandidates(history, numCandidates, ColumnPurpose); } private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates, diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs index aebcc1c6a3..9ca3e1b95d 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs @@ -31,9 +31,9 @@ public UniformRandomEngine(IHostEnvironment env) {} public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary colPurpose = null) + Dictionary columnPurpose) { - columnPurpose = colPurpose; + ColumnPurpose = columnPurpose; return GetRandomPipelines(numberOfCandidates); } @@ -68,7 +68,7 @@ private PipelinePattern[] GetRandomPipelines(int numOfPipelines) // Always include features concat transform selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, columnPurpose)); + DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); // Compute hash key for checking if we've already seen this pipeline. // However, if we keep missing, don't want to get stuck in infinite loop. diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs index 7876f5142e..6e02c9d77f 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs @@ -258,7 +258,7 @@ public static long TransformsToBitmask(TransformInference.SuggestedTransform[] t /// private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, - Dictionary columnPurpose = null) + Dictionary columnPurpose) { var finalArgs = new TransformInference.Arguments { @@ -284,7 +284,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView data, AutoInference.DependencyMap dependencyMapping, TransformInference.SuggestedTransform[] selectedTransforms, TransformInference.SuggestedTransform[] allTransforms, - Dictionary columnPurpose = null) + Dictionary columnPurpose) { int level = 1; int atomicGroupLimit = 0; diff --git a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs index a05e9affa8..0c3ec3b424 100644 --- a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs +++ b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs @@ -22,7 +22,7 @@ namespace Microsoft.ML.Runtime.PipelineInference public interface IPipelineOptimizer { PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose = null); + Dictionary columnPurpose); void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, @@ -45,7 +45,7 @@ public abstract class PipelineOptimizerBase : IPipelineOptimizer protected IDataView OriginalData; protected IDataView FullyTransformedData; protected AutoInference.DependencyMap DependencyMapping; - protected Dictionary columnPurpose; + protected Dictionary ColumnPurpose; protected readonly IHostEnvironment Env; protected readonly IHost Host; protected readonly Dictionary TransformsMaskValidity; @@ -63,7 +63,7 @@ protected PipelineOptimizerBase(IHostEnvironment env, IHost host) } public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose = null); + Dictionary columnPurpose); public virtual void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 05cc914a72..9eb9b9ad4c 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -37,31 +37,31 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Output datasets from previous iteration of sweep.", SortOrder = 7, Hide = true)] public IDataView[] CandidateOutputs; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Ignore'", SortOrder = 8, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Ignore'", SortOrder = 8)] public string[] IgnoreColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 9, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 9)] public string[] NameColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 10, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 10)] public string[] LabelColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 11, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 11)] public string[] NumericFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 12, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 12)] public string[] CategoricalFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 13, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 13)] public string[] TextFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 14, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 14)] public string[] WeightColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 15, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 15)] public string[] GroupIdColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 16, Hide = true)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 16)] public string[] ImagePathColumns; } diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs index 98fb76c7ac..971234f1d8 100644 --- a/src/Microsoft.ML.PipelineInference/TransformInference.cs +++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs @@ -1597,7 +1597,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi } public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args, - Dictionary columnPurpose = null) + Dictionary columnPurpose) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferConcatNumericFeatures"); diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index bbb97d33f8..1d850a9f8d 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -125,6 +125,81 @@ public void TestPipelineSweeperMacroNoTransforms() Assert.True(amlsOut.GetBestPipeline().PerformanceSummary.MetricValue > 0.1); } + + [Fact] + [TestCategory("EntryPoints")] + public void TestPipelineSweeperMacroColumnPurpose() + { + // Set up inputs for experiment + string pathData = GetDataPath("adult.train"); + string pathDataTest = GetDataPath("adult.test"); + const int numOfSampleRows = 1000; + const string schema = "sep=, col=F0:R4:0 col=F2:R4:2 col=F4:R4:4 col=F1012:R4:10-12 col=F14:R4:14 header=+"; + + var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); +#pragma warning disable 0618 + var datasetTrain = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); + var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); + var datasetTest = ImportTextData.ImportText(Env, + new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); +#pragma warning restore 0618 + const int batchSize = 5; + const int numIterations = 1; + const int numTransformLevels = 2; + AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; + var columnPurpose = new Dictionary() { + { "F14", ColumnPurpose.Label }, + { "F0", ColumnPurpose.Ignore }, + { "F2", ColumnPurpose.Ignore }, + }; + + // Using the simple, uniform random sampling (with replacement) engine + PipelineOptimizerBase autoMlEngine = new DefaultsEngine(Env, null); + + // Create search object + var amls = new AutoInference.AutoMlMlState(Env, metric, autoMlEngine, new IterationTerminator(numIterations), + MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest); + + // Infer search space + amls.InferSearchSpace(numTransformLevels, columnPurpose); + + // Create macro object + var pipelineSweepInput = new Microsoft.ML.Models.PipelineSweeper() + { + BatchSize = batchSize, + }; + + var exp = new Experiment(Env); + var output = exp.Add(pipelineSweepInput); + exp.Compile(); + exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain); + exp.SetInput(pipelineSweepInput.TestingData, datasetTest); + exp.SetInput(pipelineSweepInput.State, amls); + exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]); + exp.Run(); + + // Verify Results. + var results = exp.GetOutput(output.Results); + var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); + Assert.True(rows.Length == numIterations); + Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); + + var bestPipelineJsonGraph = rows[0].GraphJson; + JObject bestPipeline = JObject.Parse(bestPipelineJsonGraph); + + var label = bestPipeline["Nodes"][0]["Inputs"]["Column"][0]["Source"]; + Assert.Equal("F14", label); // Check F14 was picked as Label + + var features = (JArray)bestPipeline["Nodes"][1]["Inputs"]["Column"][0]["Source"]; + Assert.Equal(2, features.Count); // Check we have two features, F4 and F1012. + var features1 = features[0]; + var features2 = features[1]; + Assert.Equal("F4", features1); + Assert.Equal("F1012", features2); + } + + [Fact] [TestCategory("EntryPoints")] public void EntryPointPipelineSweepSerialization() From d55b1e0e124d9ed82955847bb844e95ec844f619 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Wed, 11 Jul 2018 15:53:17 +0000 Subject: [PATCH 07/10] API changes to use RoleMappedData --- .../AutoInference.cs | 15 ++++++--------- .../AutoMlEngines/DefaultsEngine.cs | 8 ++++---- .../AutoMlEngines/RocketEngine.cs | 11 +++++------ .../AutoMlEngines/UniformRandomEngine.cs | 8 ++++---- src/Microsoft.ML.PipelineInference/AutoMlUtils.cs | 10 ++++------ .../Interfaces/IPipelineOptimizer.cs | 8 +++----- .../Macros/PipelineSweeperMacro.cs | 2 +- .../PurposeInference.cs | 14 ++------------ .../TransformInference.cs | 10 ++++------ .../TestAutoInference.cs | 2 +- 10 files changed, 34 insertions(+), 54 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 776b44fe89..4ff8fe510b 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -209,7 +209,7 @@ public sealed class AutoMlMlState : IMlState private TransformInference.SuggestedTransform[] _availableTransforms; private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners; private DependencyMap _dependencyMapping; - private Dictionary _columnPurpose; + private RoleMappedData _dataRoles; public IPipelineOptimizer AutoMlEngine { get; set; } public PipelinePattern[] BatchCandidates { get; set; } public SupportedMetric Metric { get; } @@ -315,7 +315,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows) var currentBatchSize = batchSize; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), batchSize); - var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize, _columnPurpose); + var candidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Values, currentBatchSize, _dataRoles); // Break if no candidates returned, means no valid pipeline available. if (candidates.Length == 0) @@ -372,7 +372,7 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T TransformInference.SuggestedTransform[] existingTransforms = null) { // Infer transforms using experts - var levelTransforms = TransformInference.InferTransforms(_env, data, args, _columnPurpose); + var levelTransforms = TransformInference.InferTransforms(_env, data, args, _dataRoles); // Retain only those transforms inferred which were also passed in. if (existingTransforms != null) @@ -380,13 +380,13 @@ private TransformInference.SuggestedTransform[] InferAndFilter(IDataView data, T return levelTransforms; } - public void InferSearchSpace(int numTransformLevels, Dictionary columnPurpose = null) + public void InferSearchSpace(int numTransformLevels, RoleMappedData dataRoles = null) { var learners = RecipeInference.AllowedLearners(_env, TrainerKind).ToArray(); if (_requestedLearners != null && _requestedLearners.Length > 0) learners = learners.Where(l => _requestedLearners.Contains(l.LearnerName)).ToArray(); - _columnPurpose = columnPurpose; + _dataRoles = dataRoles; ComputeSearchSpace(numTransformLevels, learners, (b, c) => InferAndFilter(b, c)); } @@ -540,10 +540,7 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) var currentBatchSize = numberOfCandidates; if (_terminator is IterationTerminator itr) currentBatchSize = Math.Min(itr.RemainingIterations(_history), numberOfCandidates); - BatchCandidates = AutoMlEngine.GetNextCandidates( - _sortedSampledElements.Select(kvp => kvp.Value), - currentBatchSize, - _columnPurpose); + BatchCandidates = AutoMlEngine.GetNextCandidates(_sortedSampledElements.Select(kvp => kvp.Value), currentBatchSize, _dataRoles); using (var ch = _host.Start("Suggested Pipeline")) { diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs index 0396f7f971..19583cef8c 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/DefaultsEngine.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.PipelineInference; @@ -33,11 +34,10 @@ public DefaultsEngine(IHostEnvironment env, Arguments args) _currentLearnerIndex = 0; } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary columnPurpose) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, RoleMappedData dataRoles) { var candidates = new List(); - ColumnPurpose = columnPurpose; + DataRoles = dataRoles; while (candidates.Count < numCandidates) { @@ -80,7 +80,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(out long transf // Add final features concat transform. sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles)); return sampledTransforms.ToArray(); } diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs index af1f988140..f06fa759e8 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/RocketEngine.cs @@ -187,7 +187,7 @@ private TransformInference.SuggestedTransform[] SampleTransforms(RecipeInference // cause an error in verification, since it isn't included in the original // dependency mapping (i.e., its level isn't in the dictionary). sampledTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); + DependencyMapping, sampledTransforms.ToArray(), AvailableTransforms, DataRoles)); transformsBitMask = mask; return sampledTransforms.ToArray(); @@ -202,11 +202,10 @@ private RecipeInference.SuggestedRecipe.SuggestedLearner[] GetTopLearners(IEnume .Select(t=>AvailableLearners[t.Index]).ToArray(); } - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, - Dictionary columnPurpose) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numCandidates, RoleMappedData dataRoles) { var prevCandidates = history.ToArray(); - ColumnPurpose = columnPurpose; + DataRoles = dataRoles; switch (_currentStage) { @@ -222,7 +221,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable // number of candidates, using second stage logic. UpdateLearners(GetTopLearners(prevCandidates)); _currentStage++; - return GetNextCandidates(prevCandidates, numCandidates, columnPurpose); + return GetNextCandidates(prevCandidates, numCandidates, DataRoles); } else return GetInitialPipelines(prevCandidates, remainingNum); @@ -257,7 +256,7 @@ public override PipelinePattern[] GetNextCandidates(IEnumerable private PipelinePattern[] GetInitialPipelines(IEnumerable history, int numCandidates) { var engine = _secondaryEngines[_randomInit ? nameof(UniformRandomEngine) : nameof(DefaultsEngine)]; - return engine.GetNextCandidates(history, numCandidates, ColumnPurpose); + return engine.GetNextCandidates(history, numCandidates, DataRoles); } private PipelinePattern[] NextCandidates(PipelinePattern[] history, int numCandidates, diff --git a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs index 9ca3e1b95d..23afce66ff 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlEngines/UniformRandomEngine.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.PipelineInference; @@ -30,10 +31,9 @@ public UniformRandomEngine(IHostEnvironment env) : base(env, env.Register("UniformRandomEngine(AutoML)")) {} - public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose) + public override PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles) { - ColumnPurpose = columnPurpose; + DataRoles = dataRoles; return GetRandomPipelines(numberOfCandidates); } @@ -68,7 +68,7 @@ private PipelinePattern[] GetRandomPipelines(int numOfPipelines) // Always include features concat transform selectedTransforms.AddRange(AutoMlUtils.GetFinalFeatureConcat(Env, FullyTransformedData, - DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, ColumnPurpose)); + DependencyMapping, selectedTransforms.ToArray(), AvailableTransforms, DataRoles)); // Compute hash key for checking if we've already seen this pipeline. // However, if we keep missing, don't want to get stuck in infinite loop. diff --git a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs index 6e02c9d77f..5f028835e2 100644 --- a/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs +++ b/src/Microsoft.ML.PipelineInference/AutoMlUtils.cs @@ -257,8 +257,7 @@ public static long TransformsToBitmask(TransformInference.SuggestedTransform[] t /// (In other words, if there would be nothing for that concatenate transform to do.) /// private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, - IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, - Dictionary columnPurpose) + IDataView dataSample, int[] excludedColumnIndices, int level, int atomicIdOffset, RoleMappedData dataRoles) { var finalArgs = new TransformInference.Arguments { @@ -267,7 +266,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo ExcludedColumnIndices = excludedColumnIndices }; - var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs, columnPurpose); + var featuresConcatTransforms = TransformInference.InferConcatNumericFeatures(env, dataSample, finalArgs, dataRoles); for (int i = 0; i < featuresConcatTransforms.Length; i++) { @@ -283,8 +282,7 @@ private static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHo /// public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHostEnvironment env, IDataView data, AutoInference.DependencyMap dependencyMapping, TransformInference.SuggestedTransform[] selectedTransforms, - TransformInference.SuggestedTransform[] allTransforms, - Dictionary columnPurpose) + TransformInference.SuggestedTransform[] allTransforms, RoleMappedData dataRoles) { int level = 1; int atomicGroupLimit = 0; @@ -294,7 +292,7 @@ public static TransformInference.SuggestedTransform[] GetFinalFeatureConcat(IHos atomicGroupLimit = allTransforms.Max(t => t.AtomicGroupId) + 1; } var excludedColumnIndices = GetExcludedColumnIndices(selectedTransforms, data, dependencyMapping); - return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit, columnPurpose); + return GetFinalFeatureConcat(env, data, excludedColumnIndices, level, atomicGroupLimit, dataRoles); } public static IDataView ApplyTransformSet(IHostEnvironment env, IDataView data, TransformInference.SuggestedTransform[] transforms) diff --git a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs index 0c3ec3b424..5fe46ec61e 100644 --- a/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs +++ b/src/Microsoft.ML.PipelineInference/Interfaces/IPipelineOptimizer.cs @@ -21,8 +21,7 @@ namespace Microsoft.ML.Runtime.PipelineInference /// public interface IPipelineOptimizer { - PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose); + PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles); void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, @@ -45,7 +44,7 @@ public abstract class PipelineOptimizerBase : IPipelineOptimizer protected IDataView OriginalData; protected IDataView FullyTransformedData; protected AutoInference.DependencyMap DependencyMapping; - protected Dictionary ColumnPurpose; + protected RoleMappedData DataRoles; protected readonly IHostEnvironment Env; protected readonly IHost Host; protected readonly Dictionary TransformsMaskValidity; @@ -62,8 +61,7 @@ protected PipelineOptimizerBase(IHostEnvironment env, IHost host) ProbUtils = new SweeperProbabilityUtils(host); } - public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, - Dictionary columnPurpose); + public abstract PipelinePattern[] GetNextCandidates(IEnumerable history, int numberOfCandidates, RoleMappedData dataRoles); public virtual void SetSpace(TransformInference.SuggestedTransform[] availableTransforms, RecipeInference.SuggestedRecipe.SuggestedLearner[] availableLearners, diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 9eb9b9ad4c..36661305ec 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -224,7 +224,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( // Make sure search space is defined. If not, infer, // with default number of transform levels. if (!autoMlState.IsSearchSpaceDefined()) - autoMlState.InferSearchSpace(numTransformLevels: 1, columnPurpose); + autoMlState.InferSearchSpace(numTransformLevels: 1); // Extract performance summaries and assign to previous candidate pipelines. foreach (var pipeline in autoMlState.BatchCandidates) diff --git a/src/Microsoft.ML.PipelineInference/PurposeInference.cs b/src/Microsoft.ML.PipelineInference/PurposeInference.cs index e34f6bb808..2c181e1f9b 100644 --- a/src/Microsoft.ML.PipelineInference/PurposeInference.cs +++ b/src/Microsoft.ML.PipelineInference/PurposeInference.cs @@ -318,10 +318,10 @@ private static IEnumerable GetExperts() /// The data to use for inference. /// Indices of columns that we're interested in. /// Additional arguments to inference. - /// (Optional) User defined mapping of Column to Purpose. + /// (Optional) User defined Role mappings for data. /// The result includes the array of auto-detected column purposes. public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data, IEnumerable columnIndices, Arguments args, - Dictionary columnPurpose = null) + RoleMappedData dataRoles = null) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("InferPurposes"); @@ -335,16 +335,6 @@ public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToArray(); data = takenData; - // Instantiate with purpose provided in columnPurpose (if provided) - if (columnPurpose != null && columnPurpose.Count > 0) - { - foreach (var col in cols) - { - if (columnPurpose.ContainsKey(col.ColumnName)) - col.SuggestedPurpose = columnPurpose[col.ColumnName]; - } - } - foreach (var expert in GetExperts()) { using (var expertChannel = host.Start(expert.GetType().ToString())) diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs index 971234f1d8..3028d5903b 100644 --- a/src/Microsoft.ML.PipelineInference/TransformInference.cs +++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs @@ -1559,8 +1559,7 @@ public static InferenceResult InferTransforms(IHostEnvironment env, IDataView da } } - public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args, - Dictionary columnPurpose = null) + public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataView data, Arguments args, RoleMappedData dataRoles) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferTransforms"); @@ -1577,7 +1576,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi // Infer column purposes from data sample. var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, dataSample.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs, columnPurpose); + var piResult = PurposeInference.InferPurposes(env, dataSample, columnIndices, piArgs, dataRoles); var purposes = piResult.Columns; // Infer transforms @@ -1596,8 +1595,7 @@ public static SuggestedTransform[] InferTransforms(IHostEnvironment env, IDataVi .Contains(t.AtomicGroupId)).ToArray(); } - public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args, - Dictionary columnPurpose) + public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment env, IDataView data, Arguments args, RoleMappedData dataRoles) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("InferConcatNumericFeatures"); @@ -1610,7 +1608,7 @@ public static SuggestedTransform[] InferConcatNumericFeatures(IHostEnvironment e // Infer column purposes from data sample. var piArgs = new PurposeInference.Arguments { MaxRowsToRead = MaxRowsToRead }; var columnIndices = Enumerable.Range(0, data.Schema.ColumnCount); - var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs, columnPurpose); + var piResult = PurposeInference.InferPurposes(env, data, columnIndices, piArgs, dataRoles); var purposes = piResult.Columns; var cols = purposes.Where(x => !data.Schema.IsHidden(x.ColumnIndex) diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index 1d850a9f8d..9504c72964 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -162,7 +162,7 @@ public void TestPipelineSweeperMacroColumnPurpose() MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest); // Infer search space - amls.InferSearchSpace(numTransformLevels, columnPurpose); + amls.InferSearchSpace(numTransformLevels); // Create macro object var pipelineSweepInput = new Microsoft.ML.Models.PipelineSweeper() From d4f61027637d904fb8808e15cde14d24294c2000 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 12 Jul 2018 01:31:49 +0000 Subject: [PATCH 08/10] taking care of review comments --- .../Macros/PipelineSweeperMacro.cs | 101 ++++--- .../PurposeInference.cs | 15 +- src/Microsoft.ML/CSharpApi.cs | 27 +- .../Common/EntryPoints/core_manifest.json | 42 +-- .../TestAutoInference.cs | 282 ++---------------- 5 files changed, 115 insertions(+), 352 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index 36661305ec..a24fdd7e27 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -37,31 +37,28 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Output datasets from previous iteration of sweep.", SortOrder = 7, Hide = true)] public IDataView[] CandidateOutputs; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Ignore'", SortOrder = 8)] - public string[] IgnoreColumns; + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 8)] + public string[] LabelColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 9)] - public string[] NameColumns; + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 9)] + public string[] GroupIdColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 10)] - public string[] LabelColumns; + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 10)] + public string[] WeightColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 11)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 11)] + public string[] NameColumns; + + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 12)] public string[] NumericFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 12)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 13)] public string[] CategoricalFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 13)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 14)] public string[] TextFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 14)] - public string[] WeightColumns; - - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 15)] - public string[] GroupIdColumns; - - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 16)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 15)] public string[] ImagePathColumns; } @@ -115,65 +112,75 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input) return new Output { Results = outputView, State = autoMlState }; } - private static Dictionary GetColumnPurpose(IHostEnvironment env, Arguments input) + private static RoleMappedData GetDataRoles(IHostEnvironment env, Arguments input) { - var columnPurpose = new Dictionary(); - if (input.IgnoreColumns != null) + var roles = new List>(); + + if (input.LabelColumns != null) { - foreach (var colName in input.IgnoreColumns) - { - columnPurpose.Add(colName, ColumnPurpose.Ignore); - } + env.Check(input.LabelColumns.Length == 1, "LabelColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Label.Bind(input.LabelColumns[0])); } - if (input.NameColumns != null) + + if (input.GroupIdColumns != null) { - env.Check(input.NameColumns.Length == 1, "NameColumn expected one column name to be specified."); - columnPurpose.Add(input.NameColumns[0], ColumnPurpose.Name); + env.Check(input.GroupIdColumns.Length == 1, "GroupIdColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Group.Bind(input.GroupIdColumns[0])); } - if (input.LabelColumns != null) + + if (input.WeightColumns != null) { - env.Check(input.LabelColumns.Length == 1, "LabelColumn expected one column name to be specified."); - columnPurpose.Add(input.LabelColumns[0], ColumnPurpose.Label); + env.Check(input.WeightColumns.Length == 1, "WeightColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Weight.Bind(input.WeightColumns[0])); } + + if (input.NameColumns != null) + { + env.Check(input.NameColumns.Length == 1, "NameColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Name.Bind(input.NameColumns[0])); + } + if (input.NumericFeatureColumns != null) { + var numericFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.NumericFeature.ToString()); foreach (var colName in input.NumericFeatureColumns) { - columnPurpose.Add(colName, ColumnPurpose.NumericFeature); + var item = numericFeature.Bind(colName); + roles.Add(item); } } + if (input.CategoricalFeatureColumns != null) { + var categoricalFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.CategoricalFeature.ToString()); foreach (var colName in input.CategoricalFeatureColumns) { - columnPurpose.Add(colName, ColumnPurpose.CategoricalFeature); + var item = categoricalFeature.Bind(colName); + roles.Add(item); } } + if (input.TextFeatureColumns != null) { + var textFeature = new RoleMappedSchema.ColumnRole(ColumnPurpose.TextFeature.ToString()); foreach (var colName in input.TextFeatureColumns) { - columnPurpose.Add(colName, ColumnPurpose.TextFeature); + var item = textFeature.Bind(colName); + roles.Add(item); } } - if (input.WeightColumns != null) - { - env.Check(input.WeightColumns.Length == 1, "WeightColumn expected one column name to be specified."); - columnPurpose.Add(input.WeightColumns[0], ColumnPurpose.Weight); - } - if (input.GroupIdColumns != null) - { - env.Check(input.GroupIdColumns.Length == 1, "GroupIdColumn expected one column name to be specified."); - columnPurpose.Add(input.GroupIdColumns[0], ColumnPurpose.GroupId); - } + if (input.ImagePathColumns != null) { + var imagePath = new RoleMappedSchema.ColumnRole(ColumnPurpose.ImagePath.ToString()); foreach (var colName in input.ImagePathColumns) { - columnPurpose.Add(colName, ColumnPurpose.ImagePath); + var item = imagePath.Bind(colName); + roles.Add(item); } } - return columnPurpose; + + return new RoleMappedData(input.TrainingData, roles); } [TlcModule.EntryPoint(Desc = "AutoML pipeline sweeping optimzation macro.", Name = "Models.PipelineSweeper")] @@ -186,8 +193,8 @@ public static CommonOutputs.MacroOutput PipelineSweep( "Must have a valid AutoML State, or pass arguments to create one."); env.Check(input.BatchSize > 0, "Batch size must be > 0."); - // Get the user-defined column purposes (if any) - var columnPurpose = GetColumnPurpose(env, input); + // Get the user-defined column roles (if any) + var dataRoles = GetDataRoles(env, input); // If no current state, create object and set data. if (input.State == null) @@ -224,7 +231,7 @@ public static CommonOutputs.MacroOutput PipelineSweep( // Make sure search space is defined. If not, infer, // with default number of transform levels. if (!autoMlState.IsSearchSpaceDefined()) - autoMlState.InferSearchSpace(numTransformLevels: 1); + autoMlState.InferSearchSpace(numTransformLevels: 1, dataRoles); // Extract performance summaries and assign to previous candidate pipelines. foreach (var pipeline in autoMlState.BatchCandidates) diff --git a/src/Microsoft.ML.PipelineInference/PurposeInference.cs b/src/Microsoft.ML.PipelineInference/PurposeInference.cs index 2c181e1f9b..916644bcf7 100644 --- a/src/Microsoft.ML.PipelineInference/PurposeInference.cs +++ b/src/Microsoft.ML.PipelineInference/PurposeInference.cs @@ -332,14 +332,25 @@ public static InferenceResult InferPurposes(IHostEnvironment env, IDataView data using (var ch = host.Start("InferPurposes")) { var takenData = data.Take(args.MaxRowsToRead); - var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToArray(); + var cols = columnIndices.Select(x => new IntermediateColumn(takenData, x)).ToList(); data = takenData; + if (dataRoles != null) + { + var items = dataRoles.Schema.GetColumnRoles(); + foreach(var item in items) + { + Enum.TryParse(item.Key.Value, out ColumnPurpose purpose); + var col = cols.Find(x => x.ColumnName == item.Value.Name); + col.SuggestedPurpose = purpose; + } + } + foreach (var expert in GetExperts()) { using (var expertChannel = host.Start(expert.GetType().ToString())) { - expert.Apply(expertChannel, cols); + expert.Apply(expertChannel, cols.ToArray()); expertChannel.Done(); } } diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index d44cb31dca..cb6e4ae612 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3421,19 +3421,24 @@ public sealed partial class PipelineSweeper public ArrayVar CandidateOutputs { get; set; } = new ArrayVar(); /// - /// Column(s) to use as purpose 'Ignore' + /// Column(s) to use as purpose 'Label' /// - public string[] IgnoreColumns { get; set; } + public string[] LabelColumns { get; set; } /// - /// Column(s) to use as purpose 'Name' + /// Column(s) to use as purpose 'GroupId' /// - public string[] NameColumns { get; set; } + public string[] GroupIdColumns { get; set; } /// - /// Column(s) to use as purpose 'Label' + /// Column(s) to use as purpose 'Weight' /// - public string[] LabelColumns { get; set; } + public string[] WeightColumns { get; set; } + + /// + /// Column(s) to use as purpose 'Name' + /// + public string[] NameColumns { get; set; } /// /// Column(s) to use as purpose 'NumericFeature' @@ -3450,16 +3455,6 @@ public sealed partial class PipelineSweeper /// public string[] TextFeatureColumns { get; set; } - /// - /// Column(s) to use as purpose 'Weight' - /// - public string[] WeightColumns { get; set; } - - /// - /// Column(s) to use as purpose 'GroupId' - /// - public string[] GroupIdColumns { get; set; } - /// /// Column(s) to use as purpose 'ImagePath' /// diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 22f3391cfa..fb5832f4fe 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -2753,101 +2753,89 @@ "Default": null }, { - "Name": "IgnoreColumns", + "Name": "LabelColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Ignore'", + "Desc": "Column(s) to use as purpose 'Label'", "Required": false, "SortOrder": 8.0, "IsNullable": false, "Default": null }, { - "Name": "NameColumns", + "Name": "GroupIdColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Name'", + "Desc": "Column(s) to use as purpose 'GroupId'", "Required": false, "SortOrder": 9.0, "IsNullable": false, "Default": null }, { - "Name": "LabelColumns", + "Name": "WeightColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Label'", + "Desc": "Column(s) to use as purpose 'Weight'", "Required": false, "SortOrder": 10.0, "IsNullable": false, "Default": null }, { - "Name": "NumericFeatureColumns", + "Name": "NameColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'NumericFeature'", + "Desc": "Column(s) to use as purpose 'Name'", "Required": false, "SortOrder": 11.0, "IsNullable": false, "Default": null }, { - "Name": "CategoricalFeatureColumns", + "Name": "NumericFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'CategoricalFeature'", + "Desc": "Column(s) to use as purpose 'NumericFeature'", "Required": false, "SortOrder": 12.0, "IsNullable": false, "Default": null }, { - "Name": "TextFeatureColumns", + "Name": "CategoricalFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'TextFeature'", + "Desc": "Column(s) to use as purpose 'CategoricalFeature'", "Required": false, "SortOrder": 13.0, "IsNullable": false, "Default": null }, { - "Name": "WeightColumns", + "Name": "TextFeatureColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Weight'", + "Desc": "Column(s) to use as purpose 'TextFeature'", "Required": false, "SortOrder": 14.0, "IsNullable": false, "Default": null }, - { - "Name": "GroupIdColumns", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Column(s) to use as purpose 'GroupId'", - "Required": false, - "SortOrder": 15.0, - "IsNullable": false, - "Default": null - }, { "Name": "ImagePathColumns", "Type": { @@ -2856,7 +2844,7 @@ }, "Desc": "Column(s) to use as purpose 'ImagePath'", "Required": false, - "SortOrder": 16.0, + "SortOrder": 15.0, "IsNullable": false, "Default": null } diff --git a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs index 9504c72964..4d8ec880d1 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs @@ -125,81 +125,6 @@ public void TestPipelineSweeperMacroNoTransforms() Assert.True(amlsOut.GetBestPipeline().PerformanceSummary.MetricValue > 0.1); } - - [Fact] - [TestCategory("EntryPoints")] - public void TestPipelineSweeperMacroColumnPurpose() - { - // Set up inputs for experiment - string pathData = GetDataPath("adult.train"); - string pathDataTest = GetDataPath("adult.test"); - const int numOfSampleRows = 1000; - const string schema = "sep=, col=F0:R4:0 col=F2:R4:2 col=F4:R4:4 col=F1012:R4:10-12 col=F14:R4:14 header=+"; - - var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); -#pragma warning disable 0618 - var datasetTrain = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - var datasetTest = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); -#pragma warning restore 0618 - const int batchSize = 5; - const int numIterations = 1; - const int numTransformLevels = 2; - AutoInference.SupportedMetric metric = AutoInference.SupportedMetric.Auc; - var columnPurpose = new Dictionary() { - { "F14", ColumnPurpose.Label }, - { "F0", ColumnPurpose.Ignore }, - { "F2", ColumnPurpose.Ignore }, - }; - - // Using the simple, uniform random sampling (with replacement) engine - PipelineOptimizerBase autoMlEngine = new DefaultsEngine(Env, null); - - // Create search object - var amls = new AutoInference.AutoMlMlState(Env, metric, autoMlEngine, new IterationTerminator(numIterations), - MacroUtils.TrainerKinds.SignatureBinaryClassifierTrainer, datasetTrain, datasetTest); - - // Infer search space - amls.InferSearchSpace(numTransformLevels); - - // Create macro object - var pipelineSweepInput = new Microsoft.ML.Models.PipelineSweeper() - { - BatchSize = batchSize, - }; - - var exp = new Experiment(Env); - var output = exp.Add(pipelineSweepInput); - exp.Compile(); - exp.SetInput(pipelineSweepInput.TrainingData, datasetTrain); - exp.SetInput(pipelineSweepInput.TestingData, datasetTest); - exp.SetInput(pipelineSweepInput.State, amls); - exp.SetInput(pipelineSweepInput.CandidateOutputs, new IDataView[0]); - exp.Run(); - - // Verify Results. - var results = exp.GetOutput(output.Results); - var rows = PipelinePattern.ExtractResults(Env, results, "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); - Assert.True(rows.Length == numIterations); - Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); - - var bestPipelineJsonGraph = rows[0].GraphJson; - JObject bestPipeline = JObject.Parse(bestPipelineJsonGraph); - - var label = bestPipeline["Nodes"][0]["Inputs"]["Column"][0]["Source"]; - Assert.Equal("F14", label); // Check F14 was picked as Label - - var features = (JArray)bestPipeline["Nodes"][1]["Inputs"]["Column"][0]["Source"]; - Assert.Equal(2, features.Count); // Check we have two features, F4 and F1012. - var features1 = features[0]; - var features2 = features[1]; - Assert.Equal("F4", features1); - Assert.Equal("F1012", features2); - } - - [Fact] [TestCategory("EntryPoints")] public void EntryPointPipelineSweepSerialization() @@ -349,100 +274,16 @@ public void EntryPointPipelineSweep() [Fact] [TestCategory("EntryPoints")] - public void EntryPointPipelineSweepColumnPurposeDefaults() - { - // Get datasets - var pathData = GetDataPath("adult.train"); - var pathDataTest = GetDataPath("adult.test"); - const int numOfSampleRows = 1000; - int numIterations = 4; - const string schema = - "sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; - var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); -#pragma warning disable 0618 - var datasetTrain = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - var datasetTest = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); -#pragma warning restore 0618 - // Define entrypoint graph - string inputGraph = @" - { - 'Nodes': [ - { - 'Name': 'Models.PipelineSweeper', - 'Inputs': { - 'TrainingData': '$TrainingData', - 'TestingData': '$TestingData', - 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumns': ['IsOver50K'], - 'StateArguments': { - 'Name': 'AutoMlState', - 'Settings': { - 'Metric': 'Auc', - 'Engine': { - 'Name': 'Defaults' - }, - 'TerminatorArgs': { - 'Name': 'IterationLimited', - 'Settings': { - 'FinalHistoryLength': 4 - } - }, - 'TrainerKind': 'SignatureBinaryClassifierTrainer' - } - }, - 'BatchSize': 2 - }, - 'Outputs': { - 'State': '$StateOut', - 'Results': '$ResultsOut' - } - }, - ] - }"; - - JObject graph = JObject.Parse(inputGraph); - var catalog = ModuleCatalog.CreateInstance(Env); - - var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - runner.SetInput("TrainingData", datasetTrain); - runner.SetInput("TestingData", datasetTest); - runner.RunAll(); - - var autoMlState = runner.GetOutput("StateOut"); - Assert.NotNull(autoMlState); - var allPipelines = autoMlState.GetAllEvaluatedPipelines(); - var bestPipeline = autoMlState.GetBestPipeline(); - Assert.Equal(allPipelines.Length, numIterations); - - var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; - var testAuc = bestPipeline.PerformanceSummary.MetricValue; - Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); - Assert.True((0.87 < testAuc) && (testAuc < 0.88)); - - var results = runner.GetOutput("ResultsOut"); - Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, - "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); - Assert.True(rows.Length == numIterations); - Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); - } - - [Fact] - [TestCategory("EntryPoints")] - public void EntryPointPipelineSweepColumnPurposeRocket() + public void EntryPointPipelineSweepRoles() { // Get datasets var pathData = GetDataPath("adult.train"); var pathDataTest = GetDataPath("adult.test"); - const int numOfSampleRows = 1000; - int numIterations = 4; + const int numOfSampleRows = 100; + int numIterations = 2; const string schema = - "sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + - "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; + "sep=, col=age:R4:0 col=workclass:TX:1 col=fnlwgt:R4:2 col=education:TX:3 col=education_num:R4:4 col=marital_status:TX:5 col=occupation:TX:6 " + + "col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=Features:R4:10-12 col=native_country:TX:13 col=IsOver50K_:R4:14 header=+"; var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); #pragma warning disable 0618 var datasetTrain = ImportTextData.ImportText(Env, @@ -451,119 +292,41 @@ public void EntryPointPipelineSweepColumnPurposeRocket() var datasetTest = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); #pragma warning restore 0618 - // Define entrypoint graph - string inputGraph = @" - { - 'Nodes': [ - { - 'Name': 'Models.PipelineSweeper', - 'Inputs': { - 'TrainingData': '$TrainingData', - 'TestingData': '$TestingData', - 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumns': ['IsOver50K'], - 'StateArguments': { - 'Name': 'AutoMlState', - 'Settings': { - 'Metric': 'Auc', - 'Engine': { - 'Name': 'Rocket' - }, - 'TerminatorArgs': { - 'Name': 'IterationLimited', - 'Settings': { - 'FinalHistoryLength': 4 - } - }, - 'TrainerKind': 'SignatureBinaryClassifierTrainer' - } - }, - 'BatchSize': 2 - }, - 'Outputs': { - 'State': '$StateOut', - 'Results': '$ResultsOut' - } - }, - ] - }"; - - JObject graph = JObject.Parse(inputGraph); - var catalog = ModuleCatalog.CreateInstance(Env); - var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); - runner.SetInput("TrainingData", datasetTrain); - runner.SetInput("TestingData", datasetTest); - runner.RunAll(); - - var autoMlState = runner.GetOutput("StateOut"); - Assert.NotNull(autoMlState); - var allPipelines = autoMlState.GetAllEvaluatedPipelines(); - var bestPipeline = autoMlState.GetBestPipeline(); - Assert.Equal(allPipelines.Length, numIterations); - - var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; - var testAuc = bestPipeline.PerformanceSummary.MetricValue; - Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); - Assert.True((0.87 < testAuc) && (testAuc < 0.88)); - - var results = runner.GetOutput("ResultsOut"); - Assert.NotNull(results); - var rows = PipelinePattern.ExtractResults(Env, results, - "Graph", "MetricValue", "PipelineId", "TrainingMetricValue", "FirstInput", "PredictorModel"); - Assert.True(rows.Length == numIterations); - Assert.True(rows.All(r => r.TrainingMetricValue > 0.1)); - } - - [Fact] - [TestCategory("EntryPoints")] - public void EntryPointPipelineSweepColumnPurposeUniformRandom() - { - // Get datasets - var pathData = GetDataPath("adult.train"); - var pathDataTest = GetDataPath("adult.test"); - const int numOfSampleRows = 1000; - int numIterations = 4; - const string schema = -"sep=, col=age:R4:0 col=fnlwgt:R4:2 col=education_num:R4:4 col=Features:R4:10-12 col=workclass:TX:1 col=education:TX:3 col=marital_status:TX:5 col=occupation:TX:6 " + -"col=relationship:TX:7 col=ethnicity:TX:8 col=sex:TX:9 col=native_country:TX:13 col=IsOver50K:R4:14 header=+"; - - var inputFileTrain = new SimpleFileHandle(Env, pathData, false, false); -#pragma warning disable 0618 - var datasetTrain = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTrain, CustomSchema = schema }).Data.Take(numOfSampleRows); - var inputFileTest = new SimpleFileHandle(Env, pathDataTest, false, false); - var datasetTest = ImportTextData.ImportText(Env, - new ImportTextData.Input { InputFile = inputFileTest, CustomSchema = schema }).Data.Take(numOfSampleRows); -#pragma warning restore 0618 // Define entrypoint graph string inputGraph = @" { - 'Nodes': [ + 'Nodes': [ { 'Name': 'Models.PipelineSweeper', 'Inputs': { 'TrainingData': '$TrainingData', 'TestingData': '$TestingData', - 'IgnoreColumns': ['age', 'fnlwgt', 'education_num', 'native_country'], - 'LabelColumns': ['IsOver50K'], + 'LabelColumns': ['IsOver50K_'], + 'WeightColumns': ['education_num'], + 'NameColumns': ['education'], + 'TextFeatureColumns': ['workclass', 'marital_status', 'occupation'], 'StateArguments': { 'Name': 'AutoMlState', 'Settings': { 'Metric': 'Auc', 'Engine': { - 'Name': 'UniformRandom' + 'Name': 'Defaults' }, 'TerminatorArgs': { 'Name': 'IterationLimited', 'Settings': { - 'FinalHistoryLength': 4 + 'FinalHistoryLength': 2 } }, - 'TrainerKind': 'SignatureBinaryClassifierTrainer' + 'TrainerKind': 'SignatureBinaryClassifierTrainer', + 'RequestedLearners' : [ + 'LogisticRegressionBinaryClassifier', + 'FastTreeBinaryClassifier' + ] } }, - 'BatchSize': 2 + 'BatchSize': 1 }, 'Outputs': { 'State': '$StateOut', @@ -573,10 +336,9 @@ public void EntryPointPipelineSweepColumnPurposeUniformRandom() ] }"; - JObject graph = JObject.Parse(inputGraph); + JObject graphJson = JObject.Parse(inputGraph); var catalog = ModuleCatalog.CreateInstance(Env); - - var runner = new GraphRunner(Env, catalog, graph[FieldNames.Nodes] as JArray); + var runner = new GraphRunner(Env, catalog, graphJson[FieldNames.Nodes] as JArray); runner.SetInput("TrainingData", datasetTrain); runner.SetInput("TestingData", datasetTest); runner.RunAll(); @@ -589,8 +351,8 @@ public void EntryPointPipelineSweepColumnPurposeUniformRandom() var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue; var testAuc = bestPipeline.PerformanceSummary.MetricValue; - Assert.True((0.91 < trainAuc) && (trainAuc < 0.92)); - Assert.True((0.86 < testAuc) && (testAuc < 0.87)); + Assert.True((0.94 < trainAuc) && (trainAuc < 0.95)); + Assert.True((0.83 < testAuc) && (testAuc < 0.84)); var results = runner.GetOutput("ResultsOut"); Assert.NotNull(results); From 99172e2b77488fbaf654dc2bd580695e357f8b9f Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 12 Jul 2018 20:18:34 +0000 Subject: [PATCH 09/10] using pipeline.UniqueId --- src/Microsoft.ML.PipelineInference/AutoInference.cs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/AutoInference.cs b/src/Microsoft.ML.PipelineInference/AutoInference.cs index 4ff8fe510b..73a358865f 100644 --- a/src/Microsoft.ML.PipelineInference/AutoInference.cs +++ b/src/Microsoft.ML.PipelineInference/AutoInference.cs @@ -205,7 +205,6 @@ public sealed class AutoMlMlState : IMlState private IDataView _transformedData; private ITerminator _terminator; private string[] _requestedLearners; - private int _pipelineId; private TransformInference.SuggestedTransform[] _availableTransforms; private RecipeInference.SuggestedRecipe.SuggestedLearner[] _availableLearners; private DependencyMap _dependencyMapping; @@ -546,11 +545,10 @@ public PipelinePattern[] GetNextCandidates(int numberOfCandidates) { foreach (var pipeline in BatchCandidates) { - ch.Info($"AutoInference Pipeline : {_pipelineId++}"); - int transformK = 0; + ch.Info($"AutoInference Pipeline Id : {pipeline.UniqueId}"); foreach (var transform in pipeline.Transforms) { - ch.Info($"AutoInference Transform {transformK++} : {transform.Transform}"); + ch.Info($"AutoInference Transform : {transform.Transform}"); } ch.Info($"AutoInference Learner : {pipeline.Learner}"); } From fb5ca798f2b446057850413f403b152b9126d599 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 12 Jul 2018 21:29:41 +0000 Subject: [PATCH 10/10] taking care of review comments. update ColumnPurpose 'Group' so it is consistent with Role 'Group' --- .../InferenceUtils.cs | 4 ++-- .../Macros/PipelineSweeperMacro.cs | 24 +++++++++---------- .../PurposeInference.cs | 6 ++--- .../TransformInference.cs | 2 +- src/Microsoft.ML/CSharpApi.cs | 18 +++++++------- .../Common/EntryPoints/core_manifest.json | 18 +++++++------- 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/Microsoft.ML.PipelineInference/InferenceUtils.cs b/src/Microsoft.ML.PipelineInference/InferenceUtils.cs index 8f31a792eb..311e98e75d 100644 --- a/src/Microsoft.ML.PipelineInference/InferenceUtils.cs +++ b/src/Microsoft.ML.PipelineInference/InferenceUtils.cs @@ -83,7 +83,7 @@ public static Type InferPredictorCategoryType(IDataView data, PurposeInference.C label.ItemKind == DataKind.TX || data.Schema.GetColumnType(label.ColumnIndex).IsKey) { - if (columns.Any(col => col.Purpose == ColumnPurpose.GroupId)) + if (columns.Any(col => col.Purpose == ColumnPurpose.Group)) return typeof(SignatureRankerTrainer); else return typeof(SignatureMultiClassClassifierTrainer); @@ -177,7 +177,7 @@ public enum ColumnPurpose CategoricalFeature = 4, TextFeature = 5, Weight = 6, - GroupId = 7, + Group = 7, ImagePath = 8 } } diff --git a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs index a24fdd7e27..be51c1e695 100644 --- a/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs +++ b/src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs @@ -37,28 +37,28 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Output datasets from previous iteration of sweep.", SortOrder = 7, Hide = true)] public IDataView[] CandidateOutputs; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Label'", SortOrder = 8)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Label'", SortOrder = 8)] public string[] LabelColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'GroupId'", SortOrder = 9)] - public string[] GroupIdColumns; + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Group'", SortOrder = 9)] + public string[] GroupColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Weight'", SortOrder = 10)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Weight'", SortOrder = 10)] public string[] WeightColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'Name'", SortOrder = 11)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'Name'", SortOrder = 11)] public string[] NameColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'NumericFeature'", SortOrder = 12)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'NumericFeature'", SortOrder = 12)] public string[] NumericFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'CategoricalFeature'", SortOrder = 13)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'CategoricalFeature'", SortOrder = 13)] public string[] CategoricalFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'TextFeature'", SortOrder = 14)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'TextFeature'", SortOrder = 14)] public string[] TextFeatureColumns; - [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as purpose 'ImagePath'", SortOrder = 15)] + [Argument(ArgumentType.MultipleUnique, HelpText = "Column(s) to use as Role 'ImagePath'", SortOrder = 15)] public string[] ImagePathColumns; } @@ -122,10 +122,10 @@ private static RoleMappedData GetDataRoles(IHostEnvironment env, Arguments input roles.Add(RoleMappedSchema.ColumnRole.Label.Bind(input.LabelColumns[0])); } - if (input.GroupIdColumns != null) + if (input.GroupColumns != null) { - env.Check(input.GroupIdColumns.Length == 1, "GroupIdColumns expected one column name to be specified."); - roles.Add(RoleMappedSchema.ColumnRole.Group.Bind(input.GroupIdColumns[0])); + env.Check(input.GroupColumns.Length == 1, "GroupColumns expected one column name to be specified."); + roles.Add(RoleMappedSchema.ColumnRole.Group.Bind(input.GroupColumns[0])); } if (input.WeightColumns != null) diff --git a/src/Microsoft.ML.PipelineInference/PurposeInference.cs b/src/Microsoft.ML.PipelineInference/PurposeInference.cs index 916644bcf7..b870b392f3 100644 --- a/src/Microsoft.ML.PipelineInference/PurposeInference.cs +++ b/src/Microsoft.ML.PipelineInference/PurposeInference.cs @@ -147,9 +147,9 @@ public void Apply(IChannel ch, IntermediateColumn[] columns) else if (Regex.IsMatch(column.ColumnName, @"^m_rating$", RegexOptions.IgnoreCase)) column.SuggestedPurpose = ColumnPurpose.Label; else if (Regex.IsMatch(column.ColumnName, @"^m_queryid$", RegexOptions.IgnoreCase)) - column.SuggestedPurpose = ColumnPurpose.GroupId; - else if (Regex.IsMatch(column.ColumnName, @"groupid", RegexOptions.IgnoreCase)) - column.SuggestedPurpose = ColumnPurpose.GroupId; + column.SuggestedPurpose = ColumnPurpose.Group; + else if (Regex.IsMatch(column.ColumnName, @"group", RegexOptions.IgnoreCase)) + column.SuggestedPurpose = ColumnPurpose.Group; else if (Regex.IsMatch(column.ColumnName, @"^m_\w+id$", RegexOptions.IgnoreCase)) column.SuggestedPurpose = ColumnPurpose.Name; else if (Regex.IsMatch(column.ColumnName, @"^id$", RegexOptions.IgnoreCase)) diff --git a/src/Microsoft.ML.PipelineInference/TransformInference.cs b/src/Microsoft.ML.PipelineInference/TransformInference.cs index a55836bacf..d0475f8637 100644 --- a/src/Microsoft.ML.PipelineInference/TransformInference.cs +++ b/src/Microsoft.ML.PipelineInference/TransformInference.cs @@ -417,7 +417,7 @@ public sealed class GroupIdHashRename : TransformInferenceExpertBase { public override IEnumerable Apply(IntermediateColumn[] columns, Arguments inferenceArgs, IChannel ch) { - var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.GroupId); + var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group); if (firstGroupColId < 0) yield break; diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index cb6e4ae612..e5a7741ddc 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3421,42 +3421,42 @@ public sealed partial class PipelineSweeper public ArrayVar CandidateOutputs { get; set; } = new ArrayVar(); /// - /// Column(s) to use as purpose 'Label' + /// Column(s) to use as Role 'Label' /// public string[] LabelColumns { get; set; } /// - /// Column(s) to use as purpose 'GroupId' + /// Column(s) to use as Role 'Group' /// - public string[] GroupIdColumns { get; set; } + public string[] GroupColumns { get; set; } /// - /// Column(s) to use as purpose 'Weight' + /// Column(s) to use as Role 'Weight' /// public string[] WeightColumns { get; set; } /// - /// Column(s) to use as purpose 'Name' + /// Column(s) to use as Role 'Name' /// public string[] NameColumns { get; set; } /// - /// Column(s) to use as purpose 'NumericFeature' + /// Column(s) to use as Role 'NumericFeature' /// public string[] NumericFeatureColumns { get; set; } /// - /// Column(s) to use as purpose 'CategoricalFeature' + /// Column(s) to use as Role 'CategoricalFeature' /// public string[] CategoricalFeatureColumns { get; set; } /// - /// Column(s) to use as purpose 'TextFeature' + /// Column(s) to use as Role 'TextFeature' /// public string[] TextFeatureColumns { get; set; } /// - /// Column(s) to use as purpose 'ImagePath' + /// Column(s) to use as Role 'ImagePath' /// public string[] ImagePathColumns { get; set; } diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index fb5832f4fe..40ca58ddd5 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -2758,19 +2758,19 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Label'", + "Desc": "Column(s) to use as Role 'Label'", "Required": false, "SortOrder": 8.0, "IsNullable": false, "Default": null }, { - "Name": "GroupIdColumns", + "Name": "GroupColumns", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'GroupId'", + "Desc": "Column(s) to use as Role 'Group'", "Required": false, "SortOrder": 9.0, "IsNullable": false, @@ -2782,7 +2782,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Weight'", + "Desc": "Column(s) to use as Role 'Weight'", "Required": false, "SortOrder": 10.0, "IsNullable": false, @@ -2794,7 +2794,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'Name'", + "Desc": "Column(s) to use as Role 'Name'", "Required": false, "SortOrder": 11.0, "IsNullable": false, @@ -2806,7 +2806,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'NumericFeature'", + "Desc": "Column(s) to use as Role 'NumericFeature'", "Required": false, "SortOrder": 12.0, "IsNullable": false, @@ -2818,7 +2818,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'CategoricalFeature'", + "Desc": "Column(s) to use as Role 'CategoricalFeature'", "Required": false, "SortOrder": 13.0, "IsNullable": false, @@ -2830,7 +2830,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'TextFeature'", + "Desc": "Column(s) to use as Role 'TextFeature'", "Required": false, "SortOrder": 14.0, "IsNullable": false, @@ -2842,7 +2842,7 @@ "Kind": "Array", "ItemType": "String" }, - "Desc": "Column(s) to use as purpose 'ImagePath'", + "Desc": "Column(s) to use as Role 'ImagePath'", "Required": false, "SortOrder": 15.0, "IsNullable": false,