From fce8ef40c494266fcd2a404ff4a6c3d93336d84e Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 10 Apr 2019 13:39:36 -0700 Subject: [PATCH 1/8] Adding Regression GAM samples. --- .../Dynamic/Trainers/Regression/Gam.cs | 189 +++++++++++------- .../Trainers/Regression/GamWithOptions.cs | 180 +++++++++++------ 2 files changed, 233 insertions(+), 136 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs index dd8107452a..be545c86dc 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs @@ -1,7 +1,7 @@ using System; -using System.Linq; +using System.Collections.Generic; using Microsoft.ML; -using Microsoft.ML.SamplesUtils; +using Microsoft.ML.Data; namespace Samples.Dynamic.Trainers.Regression { @@ -15,34 +15,32 @@ public static void Example() // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Read the Housing regression dataset - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); - - var labelName = "MedianHomeValue"; - var featureNames = data.Schema - .Select(column => column.Name) // Get the column names - .Where(name => name != labelName) // Drop the Label - .ToArray(); - - // Create a pipeline. - var pipeline = - // Concatenate the features to create a Feature vector. - mlContext.Transforms.Concatenate("Features", featureNames) - // Append a GAM regression trainer, setting the "MedianHomeValue" column as the label of the dataset, - // the "Features" column produced by concatenation as the features column, - // and use a small number of bins to make it easy to visualize in the console window. - // For real applications, it is recommended to start with the default number of bins. 
- .Append(mlContext.Regression.Trainers.Gam(labelColumnName: labelName, featureColumnName: "Features", maximumBinCountPerFeature: 16)); - - // Train the pipeline. - var trainedPipeline = pipeline.Fit(data); - - // Extract the model from the pipeline. - var gamModel = trainedPipeline.LastTransformer.Model; + // Create training and validation datasets. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. + var trainer = mlContext.Regression.Trainers.Gam(maximumBinCountPerFeature: 16); + + // Fit the model to the data using a validation set. + // GAM will use a technique called validation pruning to tune the model after training + // to improve generalization. + var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model; // Now investigate the bias and shape functions of the GAM model. // The bias represents the average prediction for the training data. - Console.WriteLine($"Average predicted cost: {gamModel.Bias:0.00}"); + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); // Expected output: // Average predicted cost: 22.53 @@ -51,56 +49,105 @@ public static void Example() // one response per feature. Unlike a linear model, this response is a function instead of a line. // Each feature response represents the deviation from the average prediction as a function of the // feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + // Get the bin effects; these are the effect size for each bin. 
+ var binEffects = gam.GetBinEffects(i); - // Let's investigate the TeacherRatio variable. This is the ratio of students to teachers, - // so the higher it is, the more students a teacher has in their classroom. - // First, let's get the index of the variable we want to look at. - var studentTeacherRatioIndex = featureNames.ToList().FindIndex(str => str.Equals("TeacherRatio")); - - // Next, let's get the array of histogram bin upper bounds from the model for this feature. - // For each feature, the shape function is calculated at `MaxBins` locations along the range of - // values that the feature takes, and the resulting shape function can be seen as a histogram of - // effects. - var teacherRatioBinUpperBounds = gamModel.GetBinUpperBounds(studentTeacherRatioIndex); - // And the array of bin effects; these are the effect size for each bin. - var teacherRatioBinEffects = gamModel.GetBinEffects(studentTeacherRatioIndex); - - // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. - Console.WriteLine("Student-Teacher Ratio"); - for (int i = 0; i < teacherRatioBinUpperBounds.Count; i++) - Console.WriteLine($"x < {teacherRatioBinUpperBounds[i]:0.00} => {teacherRatioBinEffects[i]:0.000}"); - - // Expected output: - // Student-Teacher Ratio - // x < 14.55 => 2.105 - // x < 14.75 => 2.326 - // x < 15.40 => 0.903 - // x < 16.50 => 0.651 - // x < 17.15 => 0.587 - // x < 17.70 => 0.624 - // x < 17.85 => 0.684 - // x < 18.35 => -0.315 - // x < 18.55 => -0.542 - // x < 18.75 => -0.083 - // x < 19.40 => -0.442 - // x < 20.55 => -0.649 - // x < 21.05 => -1.579 - // x < ∞ => 0.318 + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart lookup table. 
+ Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } + + // Expected output: + // Average prediction: 1.33 + // + // Feature0 + // x < -0.44 => 0.128 + // x < -0.38 => 0.066 + // x < -0.32 => 0.040 + // x < -0.26 => -0.006 + // x < -0.20 => -0.035 + // x < -0.13 => -0.050 + // x < 0.06 => -0.077 + // x < 0.12 => -0.075 + // x < 0.18 => -0.052 + // x < 0.25 => -0.031 + // x < 0.31 => -0.002 + // x < 0.37 => 0.040 + // x < 0.44 => 0.083 + // x < ∞ => 0.123 + + // Feature1 + // x < 0.00 => -0.245 + // x < 0.06 => 0.671 + // x < 0.24 => 0.723 + // x < 0.31 => -0.141 + // x < 0.37 => -0.241 + // x < ∞ => -0.248 // Let's consider this output. To score a given example, we look up the first bin where the inequality // is satisfied for the feature value. We can look at the whole function to get a sense for how the - // model responds to the variable on a global level. For the student-teacher-ratio variable, we can see - // that smaller class sizes are predictive of a higher house value, while student-teacher ratios higher - // than about 18 lead to lower predictions in house value. This makes intuitive sense, as smaller class - // sizes are desirable and also indicative of better-funded schools, which could make buyers likely to - // pay more for the house. - - // Another thing to notice is that these feature functions can be noisy. See student-teacher ratios > 21.05. - // Common practice is to use resampling methods to estimate a confidence interval at each bin. This will - // help to determine if the effect is real or just sampling noise. See for example + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average + // expected output over the training set. 
Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature0 should be + // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model // Distillation." arXiv:1710.06169." } + + private class Data + { + public float Label { get; set; } + [VectorType(2)] + public float[] Features { get; set; } + } + + /// + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature0 is a parabola centered around 0, + /// while Feature1 is a simple piecewise function. + /// + /// The number of examples to generate. + /// The seed for the random number generator used to produce data. + /// + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + float bias = 1.0f; + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data { + Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + }; + // Compute the label from the shape functions and add noise. 
+ data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5); + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs index 33617b2d94..45e69e3562 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -15,92 +14,143 @@ public static void Example() { // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. - // Setting the seed to a fixed number in this example to make outputs deterministic. - var mlContext = new MLContext(seed: 0); + var mlContext = new MLContext(); - // Create a list of training examples. - var examples = GenerateRandomDataPoints(1000); + // Create training and validation datasets. + var samples = GenerateData(); - // Convert the examples list to an IDataView object, which is consumable by ML.NET API. - var trainingData = mlContext.Data.LoadFromEnumerable(examples); + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); - // Define trainer options. - var options = new GamRegressionTrainer.Options - { - // The entropy (regularization) coefficient. - EntropyCoefficient = 0.3, - // Reduce the number of iterations to 50. - NumberOfIterations = 50 - }; - - // Define the trainer. 
- var pipeline = mlContext.Regression.Trainers.Gam(options); + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; - // Train the model. - var model = pipeline.Fit(trainingData); + // Create a GAM trainer. + // Use a small number of bins for this example. + var trainer = mlContext.Regression.Trainers.Gam( + new GamRegressionTrainer.Options { MaximumBinCountPerFeature = 16 }); - // Create testing examples. Use different random seed to make it different from training data. - var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed:123)); + // Fit the model to the data using a validation set. + // GAM will use a technique called validation pruning to tune the model after training + // to improve generalization. + var model = trainer.Fit(trainSet, validSet); - // Run the model on test data set. - var transformedTestData = model.Transform(testData); + // Extract the model parameters. + var gam = model.Model; - // Convert IDataView object to a list. - var predictions = mlContext.Data.CreateEnumerable(transformedTestData, reuseRowObject: false).ToList(); - - // Look at 5 predictions - foreach (var p in predictions.Take(5)) - Console.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}"); + // Now investigate the bias and shape functions of the GAM model. + // The bias represents the average prediction for the training data. 
+ Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); // Expected output: - // Label: 0.985, Prediction: 0.841 - // Label: 0.155, Prediction: 0.187 - // Label: 0.515, Prediction: 0.496 - // Label: 0.566, Prediction: 0.467 - // Label: 0.096, Prediction: 0.144 + // Average predicted cost: 22.53 - // Evaluate the overall metrics - var metrics = mlContext.Regression.Evaluate(transformedTestData); - Microsoft.ML.SamplesUtils.ConsoleUtils.PrintMetrics(metrics); + // Let's take a look at the features that the model built. Similar to a linear model, we have + // one response per feature. Unlike a linear model, this response is a function instead of a line. + // Each feature response represents the deviation from the average prediction as a function of the + // feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + // Get the bin effects; these are the effect size for each bin. + var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart lookup table. 
+ Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } // Expected output: - // Mean Absolute Error: 0.06 - // Mean Squared Error: 0.01 - // Root Mean Squared Error: 0.08 - // RSquared: 0.93 + // Average prediction: 1.33 + // + // Feature0 + // x < -0.44 => 0.128 + // x < -0.38 => 0.066 + // x < -0.32 => 0.040 + // x < -0.26 => -0.006 + // x < -0.20 => -0.035 + // x < -0.13 => -0.050 + // x < 0.06 => -0.077 + // x < 0.12 => -0.075 + // x < 0.18 => -0.052 + // x < 0.25 => -0.031 + // x < 0.31 => -0.002 + // x < 0.37 => 0.040 + // x < 0.44 => 0.083 + // x < ∞ => 0.123 + + // Feature1 + // x < 0.00 => -0.245 + // x < 0.06 => 0.671 + // x < 0.24 => 0.723 + // x < 0.31 => -0.141 + // x < 0.37 => -0.241 + // x < ∞ => -0.248 + + // Let's consider this output. To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be + // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." arXiv:1710.06169." 
+ } + + private class Data + { + public float Label { get; set; } + [VectorType(2)] + public float[] Features { get; set; } } - private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + /// + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature1 is a parabola centered around 0, + /// while Feature2 is a simple piecewise function. + /// + /// The number of examples to generate. + /// The seed for the random number generator used to produce data. + /// + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) { - var random = new Random(seed); - float randomFloat() => (float)random.NextDouble(); - for (int i = 0; i < count; i++) + float bias = 1.0f; + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) { - var label = randomFloat(); - yield return new DataPoint + // Generate random, uncoupled features. + var data = new Data { - Label = label, - // Create random features that are correlated with label. - Features = Enumerable.Repeat(label, 50).Select(x => x + randomFloat()).ToArray() + Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } }; + // Compute the label from the shape functions and add noise. + data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5); + + yield return data; } } - // Example with label and 50 feature values. A data set is a collection of such examples. - private class DataPoint - { - public float Label { get; set; } - [VectorType(50)] - public float[] Features { get; set; } - } + private static float Parabola(float x) => x * x; - // Class used to capture predictions. - private class Prediction + private static float SimplePiecewise(float x) { - // Original label. - public float Label { get; set; } - // Predicted score from the trainer. 
- public float Score { get; set; } + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; } } -} \ No newline at end of file +} From 7d335d05fb766104a20b6a43309698d76ac8116b Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 10 Apr 2019 15:52:40 -0700 Subject: [PATCH 2/8] Adding GAM Binary Classification samples. --- .../Trainers/BinaryClassification/Gam.cs | 147 +++++++++++++++++ .../BinaryClassification/GamWithOptions.cs | 150 ++++++++++++++++++ .../Dynamic/Trainers/Regression/Gam.cs | 6 +- .../Trainers/Regression/GamWithOptions.cs | 8 +- .../TreeTrainersCatalog.cs | 18 ++- 5 files changed, 318 insertions(+), 11 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs new file mode 100644 index 0000000000..90af558c0b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -0,0 +1,147 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic.Trainers.BinaryClassification +{ + public static class Gam + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create training and validation datasets. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. 
+ var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. + var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16); + + // Fit the model to the data using a validation set. + // GAM will use a technique called validation pruning to tune the model after training + // to improve generalization. + var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model.SubModel; + + // Now investigate the bias and shape functions of the GAM model. + // The bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Let's take a look at the features that the model built. Similar to a linear model, we have + // one response per feature. Unlike a linear model, this response is a function instead of a line. + // Each feature response represents the deviation from the average prediction as a function of the + // feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + // Get the bin effects; these are the effect size for each bin. + var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart lookup table. 
+ Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } + + // Expected output: + // Average prediction: 0.82 + // + // Feature0 + // x < -0.44 => 0.286 + // x < -0.38 => 0.225 + // x < -0.32 => 0.048 + // x < -0.26 => -0.110 + // x < -0.20 => -0.116 + // x < 0.18 => -0.143 + // x < 0.25 => -0.115 + // x < 0.31 => -0.005 + // x < 0.37 => 0.097 + // x < 0.44 => 0.263 + // x < ∞ => 0.284 + // + // Feature1 + // x < 0.00 => -0.350 + // x < 0.24 => 0.875 + // x < 0.31 => -0.138 + // x < ∞ => -0.188 + + // Let's consider this output. To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be + // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." arXiv:1710.06169." + } + + private class Data + { + public bool Label { get; set; } + + [VectorType(2)] + public float[] Features { get; set; } + } + + /// + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. 
Feature1 is a parabola centered around 0, + /// while Feature2 is a simple piecewise function. + /// + /// The number of examples to generate. + /// The seed for the random number generator used to produce data. + /// + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data { + Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + }; + // Compute the label from the shape functions and add noise. + data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5)) > 0.5; + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs new file mode 100644 index 0000000000..792ccffd9b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs @@ -0,0 +1,150 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Trainers.BinaryClassification +{ + public static class GamWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. 
+ var mlContext = new MLContext(); + + // Create training and validation datasets. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. + var trainer = mlContext.BinaryClassification.Trainers.Gam( + new GamBinaryTrainer.Options { MaximumBinCountPerFeature = 16 }); + + // Fit the model to the data using a validation set. + // GAM will use a technique called validation pruning to tune the model after training + // to improve generalization. + var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model.SubModel; + + // Now investigate the bias and shape functions of the GAM model. + // The bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Let's take a look at the features that the model built. Similar to a linear model, we have + // one response per feature. Unlike a linear model, this response is a function instead of a line. + // Each feature response represents the deviation from the average prediction as a function of the + // feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + // Get the bin effects; these are the effect size for each bin. + var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart lookup table. 
+ Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } + + // Expected output: + // Average prediction: 0.82 + // + // Feature0 + // x < -0.44 => 0.286 + // x < -0.38 => 0.225 + // x < -0.32 => 0.048 + // x < -0.26 => -0.110 + // x < -0.20 => -0.116 + // x < 0.18 => -0.143 + // x < 0.25 => -0.115 + // x < 0.31 => -0.005 + // x < 0.37 => 0.097 + // x < 0.44 => 0.263 + // x < ∞ => 0.284 + // + // Feature1 + // x < 0.00 => -0.350 + // x < 0.24 => 0.875 + // x < 0.31 => -0.138 + // x < ∞ => -0.188 + + // Let's consider this output. To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be + // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." arXiv:1710.06169." + } + + private class Data + { + public bool Label { get; set; } + + [VectorType(2)] + public float[] Features { get; set; } + } + + /// + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. 
Feature1 is a parabola centered around 0, + /// while Feature2 is a simple piecewise function. + /// + /// The number of examples to generate. + /// The seed for the random number generator used to produce data. + /// + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data + { + Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + }; + // Compute the label from the shape functions and add noise. + data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5)) > 0.5; + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs index be545c86dc..ec28053f24 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs @@ -42,9 +42,6 @@ public static void Example() // The bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); - // Expected output: - // Average predicted cost: 22.53 - // Let's take a look at the features that the model built. Similar to a linear model, we have // one response per feature. Unlike a linear model, this response is a function instead of a line. 
// Each feature response represents the deviation from the average prediction as a function of the @@ -84,7 +81,7 @@ public static void Example() // x < 0.37 => 0.040 // x < 0.44 => 0.083 // x < ∞ => 0.123 - + // // Feature1 // x < 0.00 => -0.245 // x < 0.06 => 0.671 @@ -110,6 +107,7 @@ public static void Example() private class Data { public float Label { get; set; } + [VectorType(2)] public float[] Features { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs index 45e69e3562..c03621bbba 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs @@ -44,9 +44,6 @@ public static void Example() // The bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); - // Expected output: - // Average predicted cost: 22.53 - // Let's take a look at the features that the model built. Similar to a linear model, we have // one response per feature. Unlike a linear model, this response is a function instead of a line. // Each feature response represents the deviation from the average prediction as a function of the @@ -86,7 +83,7 @@ public static void Example() // x < 0.37 => 0.040 // x < 0.44 => 0.083 // x < ∞ => 0.123 - + // // Feature1 // x < 0.00 => -0.245 // x < 0.06 => 0.671 @@ -94,7 +91,7 @@ public static void Example() // x < 0.31 => -0.141 // x < 0.37 => -0.241 // x < ∞ => -0.248 - + // Let's consider this output. To score a given example, we look up the first bin where the inequality // is satisfied for the feature value. We can look at the whole function to get a sense for how the // model responds to the variable on a global level. 
@@ -112,6 +109,7 @@ public static void Example() private class Data { public float Label { get; set; } + [VectorType(2)] public float[] Features { get; set; } } diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 02ab57a7fa..7031a673da 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -173,6 +173,13 @@ public static FastTreeRankingTrainer FastTree(this RankingCatalog.RankingTrainer /// The number of iterations to use in learning the features. /// The maximum number of bins to use to approximate features. /// The learning rate. GAMs work best with a small learning rate. + /// + /// + /// + /// + /// public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, @@ -191,6 +198,13 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi /// /// The . /// Trainer options. 
+ /// + /// + /// + /// + /// public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, GamBinaryTrainer.Options options) { @@ -212,7 +226,7 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi /// /// /// /// /// @@ -237,7 +251,7 @@ public static GamRegressionTrainer Gam(this RegressionCatalog.RegressionTrainers /// /// /// /// /// From 73d8270f3b3ef63296a4a6d56fc21f5908cb8e22 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 11 Apr 2019 10:46:21 -0700 Subject: [PATCH 3/8] Update docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs Co-Authored-By: rogancarr --- .../Dynamic/Trainers/BinaryClassification/Gam.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index 90af558c0b..aee31e62b7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -27,7 +27,7 @@ public static void Example() var validSet = dataSets.TestSet; // Create a GAM trainer. - // Use a small number of bins for this example. + // Use a small number of bins for this example. The setting below means for each feature, we divide its range into 16 independent discrete regions. For example, if a feature `Age`'s origin range is from 0 to 255. The first region might be 0-15 and the second region 16-31. var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16); // Fit the model to the data using a validation set. 
From 4469a8562534ab1d68fb7e4075d7f571c42b4607 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 11 Apr 2019 10:46:31 -0700 Subject: [PATCH 4/8] Update docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs Co-Authored-By: rogancarr --- .../Dynamic/Trainers/BinaryClassification/Gam.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index aee31e62b7..ddd9e74cbd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -21,7 +21,7 @@ public static void Example() // Convert the dataset to an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); - // Create train and set set. + // Create training and validation sets. var dataSets = mlContext.Data.TrainTestSplit(data); var trainSet = dataSets.TrainSet; var validSet = dataSets.TestSet; From 091fafd1a1c968588c00ae9ca16e8dd6ff0b670d Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 11 Apr 2019 10:46:47 -0700 Subject: [PATCH 5/8] Update docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs Co-Authored-By: rogancarr --- .../Dynamic/Trainers/BinaryClassification/Gam.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index ddd9e74cbd..463af1f93c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -43,7 +43,7 @@ public static void Example() Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); // Let's take a look at the features that the model built. 
Similar to a linear model, we have - // one response per feature. Unlike a linear model, this response is a function instead of a line. + // one response per feature. Unlike a linear model, this response is a step function (https://en.wikipedia.org/wiki/Step_function) instead of a line. // Each feature response represents the deviation from the average prediction as a function of the // feature value. for (int i = 0; i < gam.NumberOfShapeFunctions; i++) From 7f7c991cf3172e4b55de12bb8632cdfb9fe476f1 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 11 Apr 2019 10:46:57 -0700 Subject: [PATCH 6/8] Update docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs Co-Authored-By: rogancarr --- .../Dynamic/Trainers/BinaryClassification/Gam.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index 463af1f93c..8b29087de0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -30,7 +30,7 @@ public static void Example() // Use a small number of bins for this example. The setting below means for each feature, we divide its range into 16 independent discrete regions. For example, if a feature `Age`'s origin range is from 0 to 255. The first region might be 0-15 and the second region 16-31. var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16); - // Fit the model to the data using a validation set. + // Fit the model using both of training and validation sets. // GAM will use a technique called validation pruning to tune the model after training // to improve generalization. 
var model = trainer.Fit(trainSet, validSet); From 1c992772d3da30ab64b0a420738291f47cf36b52 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Thu, 11 Apr 2019 11:11:52 -0700 Subject: [PATCH 7/8] Addressing PR comments; rewriting comments. --- .../Trainers/BinaryClassification/Gam.cs | 32 ++++++++++------- .../BinaryClassification/GamWithOptions.cs | 34 ++++++++++-------- .../Dynamic/Trainers/Regression/Gam.cs | 36 +++++++++++-------- .../Trainers/Regression/GamWithOptions.cs | 26 ++++++++------ 4 files changed, 76 insertions(+), 52 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index 8b29087de0..a9f54f7f65 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -15,7 +15,7 @@ public static void Example() // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create training and validation datasets. + // Create the dataset. var samples = GenerateData(); // Convert the dataset to an IDataView. @@ -27,25 +27,30 @@ public static void Example() var validSet = dataSets.TestSet; // Create a GAM trainer. - // Use a small number of bins for this example. The setting below means for each feature, we divide its range into 16 independent discrete regions. For example, if a feature `Age`'s origin range is from 0 to 255. The first region might be 0-15 and the second region 16-31. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. 
In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16); - // Fit the model using both of training and validation sets. - // GAM will use a technique called validation pruning to tune the model after training - // to improve generalization. + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. var model = trainer.Fit(trainSet, validSet); // Extract the model parameters. var gam = model.Model.SubModel; - // Now investigate the bias and shape functions of the GAM model. - // The bias represents the average prediction for the training data. + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); - // Let's take a look at the features that the model built. Similar to a linear model, we have - // one response per feature. Unlike a linear model, this response is a step function (https://en.wikipedia.org/wiki/Step_function) instead of a line. - // Each feature response represents the deviation from the average prediction as a function of the - // feature value. + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. Unlike a linear model, this response is a + // generic function instead of a line. Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. for (int i = 0; i < gam.NumberOfShapeFunctions; i++) { // Break a line. 
@@ -53,11 +58,12 @@ public static void Example() // Get the bin upper bounds for the feature. var binUpperBounds = gam.GetBinUpperBounds(i); - // Get the bin effects; these are the effect size for each bin. + + // Get the bin effects; these are the function values for each bin. var binEffects = gam.GetBinEffects(i); // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. Console.WriteLine($"Feature{i}"); for (int j = 0; j < binUpperBounds.Count; j++) Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs index 792ccffd9b..40697a2322 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs @@ -16,38 +16,43 @@ public static void Example() // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create training and validation datasets. + // Create the dataset. var samples = GenerateData(); // Convert the dataset to an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); - // Create train and set set. + // Create training and validation datasets. var dataSets = mlContext.Data.TrainTestSplit(data); var trainSet = dataSets.TrainSet; var validSet = dataSets.TestSet; // Create a GAM trainer. - // Use a small number of bins for this example. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. 
Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. var trainer = mlContext.BinaryClassification.Trainers.Gam( new GamBinaryTrainer.Options { MaximumBinCountPerFeature = 16 }); - // Fit the model to the data using a validation set. - // GAM will use a technique called validation pruning to tune the model after training - // to improve generalization. + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. var model = trainer.Fit(trainSet, validSet); // Extract the model parameters. var gam = model.Model.SubModel; - // Now investigate the bias and shape functions of the GAM model. - // The bias represents the average prediction for the training data. + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); - // Let's take a look at the features that the model built. Similar to a linear model, we have - // one response per feature. Unlike a linear model, this response is a function instead of a line. - // Each feature response represents the deviation from the average prediction as a function of the - // feature value. + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. Unlike a linear model, this response is a + // generic function instead of a line. 
Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. for (int i = 0; i < gam.NumberOfShapeFunctions; i++) { // Break a line. @@ -55,11 +60,12 @@ public static void Example() // Get the bin upper bounds for the feature. var binUpperBounds = gam.GetBinUpperBounds(i); - // Get the bin effects; these are the effect size for each bin. + + // Get the bin effects; these are the function values for each bin. var binEffects = gam.GetBinEffects(i); // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. Console.WriteLine($"Feature{i}"); for (int j = 0; j < binUpperBounds.Count; j++) Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs index ec28053f24..ac4aebff2d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs @@ -14,38 +14,43 @@ public static void Example() // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - - // Create training and validation datasets. + + // Create the dataset. var samples = GenerateData(); // Convert the dataset to an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); - // Create train and set set. + // Create training and validation sets. var dataSets = mlContext.Data.TrainTestSplit(data); var trainSet = dataSets.TrainSet; var validSet = dataSets.TestSet; // Create a GAM trainer. 
- // Use a small number of bins for this example. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. var trainer = mlContext.Regression.Trainers.Gam(maximumBinCountPerFeature: 16); - // Fit the model to the data using a validation set. - // GAM will use a technique called validation pruning to tune the model after training - // to improve generalization. + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. var model = trainer.Fit(trainSet, validSet); // Extract the model parameters. var gam = model.Model; - // Now investigate the bias and shape functions of the GAM model. - // The bias represents the average prediction for the training data. + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); - // Let's take a look at the features that the model built. Similar to a linear model, we have - // one response per feature. Unlike a linear model, this response is a function instead of a line. - // Each feature response represents the deviation from the average prediction as a function of the - // feature value. + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. 
Unlike a linear model, this response is a + // generic function instead of a line. Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. for (int i = 0; i < gam.NumberOfShapeFunctions; i++) { // Break a line. @@ -53,11 +58,12 @@ public static void Example() // Get the bin upper bounds for the feature. var binUpperBounds = gam.GetBinUpperBounds(i); - // Get the bin effects; these are the effect size for each bin. + + // Get the bin effects; these are the function values for each bin. var binEffects = gam.GetBinEffects(i); // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. Console.WriteLine($"Feature{i}"); for (int j = 0; j < binUpperBounds.Count; j++) Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs index c03621bbba..d59cae4db0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs @@ -16,32 +16,37 @@ public static void Example() // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create training and validation datasets. + // Create the dataset. var samples = GenerateData(); // Convert the dataset to an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); - // Create train and set set. + // Create training and validation sets. 
var dataSets = mlContext.Data.TrainTestSplit(data); var trainSet = dataSets.TrainSet; var validSet = dataSets.TestSet; // Create a GAM trainer. - // Use a small number of bins for this example. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. var trainer = mlContext.Regression.Trainers.Gam( new GamRegressionTrainer.Options { MaximumBinCountPerFeature = 16 }); - // Fit the model to the data using a validation set. - // GAM will use a technique called validation pruning to tune the model after training - // to improve generalization. + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. var model = trainer.Fit(trainSet, validSet); // Extract the model parameters. var gam = model.Model; - // Now investigate the bias and shape functions of the GAM model. - // The bias represents the average prediction for the training data. + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); // Let's take a look at the features that the model built. Similar to a linear model, we have @@ -55,11 +60,12 @@ public static void Example() // Get the bin upper bounds for the feature. var binUpperBounds = gam.GetBinUpperBounds(i); - // Get the bin effects; these are the effect size for each bin. 
+ + // Get the bin effects; these are the function values for each bin. var binEffects = gam.GetBinEffects(i); // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. Console.WriteLine($"Feature{i}"); for (int j = 0; j < binUpperBounds.Count; j++) Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); From a717ac54600344f05d03612674df46aa77556c2c Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Fri, 12 Apr 2019 14:02:33 -0700 Subject: [PATCH 8/8] Addressing PR comments. --- .../Trainers/BinaryClassification/Gam.cs | 5 ++- .../BinaryClassification/GamWithOptions.cs | 13 ++++-- .../Dynamic/Trainers/Regression/Gam.cs | 5 ++- .../Trainers/Regression/GamWithOptions.cs | 41 +++++++++++++++++-- 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs index a9f54f7f65..3b9c36644f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -123,14 +123,15 @@ private class Data private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) { var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); for (int i = 0; i < numExamples; i++) { // Generate random, uncoupled features. var data = new Data { - Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + Features = new float[2] { centeredFloat(), centeredFloat() } }; // Compute the label from the shape functions and add noise. 
- data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5)) > 0.5; + data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5; yield return data; } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs index 40697a2322..e4a408a3ae 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs @@ -34,8 +34,14 @@ public static void Example() // neighboring bins with identical values will be combined. In general, we recommend using // at least the default number of bins, as a small number of bins limits the capacity of // the model. + // Also, set the learning rate to half the default to slow down the gradient descent, and + // double the number of iterations to compensate. var trainer = mlContext.BinaryClassification.Trainers.Gam( - new GamBinaryTrainer.Options { MaximumBinCountPerFeature = 16 }); + new GamBinaryTrainer.Options { + NumberOfIterations = 19000, + MaximumBinCountPerFeature = 16, + LearningRate = 0.001 + }); // Fit the model using both of training and validation sets. GAM can use a technique called // pruning to tune the model to the validation set after training to improve generalization. @@ -125,15 +131,16 @@ private class Data private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) { var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); for (int i = 0; i < numExamples; i++) { // Generate random, uncoupled features. 
var data = new Data { - Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + Features = new float[2] { centeredFloat(), centeredFloat() } }; // Compute the label from the shape functions and add noise. - data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5)) > 0.5; + data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5; yield return data; } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs index ac4aebff2d..b070dfbda1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs @@ -129,14 +129,15 @@ private static IEnumerable GenerateData(int numExamples = 25000, int seed { float bias = 1.0f; var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); for (int i = 0; i < numExamples; i++) { // Generate random, uncoupled features. var data = new Data { - Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + Features = new float[2] { centeredFloat(), centeredFloat() } }; // Compute the label from the shape functions and add noise. 
- data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5); + data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat(); yield return data; } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs index d59cae4db0..6c973814fd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs @@ -34,8 +34,12 @@ public static void Example() // neighboring bins with identical values will be combined. In general, we recommend using // at least the default number of bins, as a small number of bins limits the capacity of // the model. + // Also, change the pruning metrics to use the mean absolute error for pruning. var trainer = mlContext.Regression.Trainers.Gam( - new GamRegressionTrainer.Options { MaximumBinCountPerFeature = 16 }); + new GamRegressionTrainer.Options { + MaximumBinCountPerFeature = 16, + PruningMetrics = 1 + }); // Fit the model using both of training and validation sets. GAM can use a technique called // pruning to tune the model to the validation set after training to improve generalization. @@ -112,6 +116,36 @@ public static void Example() // Distillation." arXiv:1710.06169." 
} +// Feature0 +//x< -0.44 => 0.131 +//x< -0.38 => 0.067 +//x< -0.32 => 0.041 +//x< -0.26 => -0.005 +//x< -0.20 => -0.035 +//x< -0.13 => -0.050 +//x< -0.07 => -0.079 +//x< -0.01 => -0.083 +//x< 0.06 => -0.079 +//x< 0.12 => -0.075 +//x< 0.18 => -0.052 +//x< 0.25 => -0.030 +//x< 0.31 => -0.002 +//x< 0.37 => 0.041 +//x< 0.44 => 0.084 +//x< ∞ => 0.126 + +//Feature1 +//x< -0.37 => -0.255 +//x< -0.25 => -0.247 +//x< 0.00 => -0.249 +//x< 0.06 => 0.671 +//x< 0.12 => 0.743 +//x< 0.24 => 0.746 +//x< 0.31 => -0.143 +//x< 0.37 => -0.245 +//x< 0.43 => -0.261 +//x< ∞ => -0.257 + private class Data { public float Label { get; set; } @@ -131,15 +165,16 @@ private static IEnumerable GenerateData(int numExamples = 25000, int seed { float bias = 1.0f; var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); for (int i = 0; i < numExamples; i++) { // Generate random, uncoupled features. var data = new Data { - Features = new float[2] { (float)(rng.NextDouble() - 0.5), (float)(rng.NextDouble() - 0.5) } + Features = new float[2] { centeredFloat(), centeredFloat() } }; // Compute the label from the shape functions and add noise. - data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + (float)(rng.NextDouble() - 0.5); + data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat(); yield return data; }