diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs
new file mode 100644
index 0000000000..3b9c36644f
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs
@@ -0,0 +1,154 @@
+using System;
+using System.Collections.Generic;
+using Microsoft.ML;
+using Microsoft.ML.Data;
+
+namespace Samples.Dynamic.Trainers.BinaryClassification
+{
+ public static class Gam
+ {
+ // This example requires installation of additional NuGet package
+ // Microsoft.ML.FastTree.
+ public static void Example()
+ {
+ // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+ // as a catalog of available operations and as the source of randomness.
+ var mlContext = new MLContext();
+
+ // Create the dataset.
+ var samples = GenerateData();
+
+ // Convert the dataset to an IDataView.
+ var data = mlContext.Data.LoadFromEnumerable(samples);
+
+ // Create training and validation sets.
+ var dataSets = mlContext.Data.TrainTestSplit(data);
+ var trainSet = dataSets.TrainSet;
+ var validSet = dataSets.TestSet;
+
+ // Create a GAM trainer.
+ // Use a small number of bins for this example. The setting below means for each feature,
+ // we divide its range into 16 discrete regions for the training process. Note that these
+ // regions are not evenly spaced, and that the final model may contain fewer bins, as
+ // neighboring bins with identical values will be combined. In general, we recommend using
+ // at least the default number of bins, as a small number of bins limits the capacity of
+ // the model.
+ var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16);
+
+ // Fit the model using both the training and validation sets. GAM can use a technique called
+ // pruning to tune the model to the validation set after training to improve generalization.
+ var model = trainer.Fit(trainSet, validSet);
+
+ // Extract the model parameters.
+ var gam = model.Model.SubModel;
+
+ // Now we can inspect the parameters of the Generalized Additive Model to understand the fit
+ // and potentially learn about our dataset.
+ // First, we will look at the bias; the bias represents the average prediction for the training data.
+ Console.WriteLine($"Average prediction: {gam.Bias:0.00}");
+
+ // Now look at the shape functions that the model has learned. Similar to a linear model, we have
+ // one response per feature, and they are independent. Unlike a linear model, this response is a
+ // generic function instead of a line. Because we have included a bias term, each feature response
+ // represents the deviation from the average prediction as a function of the feature value.
+ for (int i = 0; i < gam.NumberOfShapeFunctions; i++)
+ {
+ // Break a line.
+ Console.WriteLine();
+
+ // Get the bin upper bounds for the feature.
+ var binUpperBounds = gam.GetBinUpperBounds(i);
+
+ // Get the bin effects; these are the function values for each bin.
+ var binEffects = gam.GetBinEffects(i);
+
+ // Now, write the function to the console. The function is a set of bins, and the corresponding
+ // function values. You can think of GAMs as building a bar-chart or lookup table for each feature.
+ Console.WriteLine($"Feature{i}");
+ for (int j = 0; j < binUpperBounds.Count; j++)
+ Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}");
+ }
+
+ // Expected output:
+ // Average prediction: 0.82
+ //
+ // Feature0
+ // x < -0.44 => 0.286
+ // x < -0.38 => 0.225
+ // x < -0.32 => 0.048
+ // x < -0.26 => -0.110
+ // x < -0.20 => -0.116
+ // x < 0.18 => -0.143
+ // x < 0.25 => -0.115
+ // x < 0.31 => -0.005
+ // x < 0.37 => 0.097
+ // x < 0.44 => 0.263
+ // x < ∞ => 0.284
+ //
+ // Feature1
+ // x < 0.00 => -0.350
+ // x < 0.24 => 0.875
+ // x < 0.31 => -0.138
+ // x < ∞ => -0.188
+
+ // Let's consider this output. To score a given example, we look up the first bin where the inequality
+ // is satisfied for the feature value. We can look at the whole function to get a sense for how the
+ // model responds to the variable on a global level.
+ // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average
+ // expected output over the training set. Very few bins are used to model the second feature because the GAM model
+ // discards unchanged bins to create smaller models.
+ // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be
+ // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use
+ // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is
+ // real or just sampling noise. See for example:
+ // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
+ // Distillation." arXiv:1710.06169.
+ }
+
+ private class Data
+ {
+ public bool Label { get; set; }
+
+ [VectorType(2)]
+ public float[] Features { get; set; }
+ }
+
+ /// <summary>
+ /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. The first feature is a parabola centered around 0,
+ /// while the second feature is a simple piecewise function.
+ /// </summary>
+ /// <param name="numExamples">The number of examples to generate.</param>
+ /// <param name="seed">The seed for the random number generator used to produce data.</param>
+ /// <returns>An IEnumerable of Data objects.</returns>
+ private static IEnumerable<Data> GenerateData(int numExamples = 25000, int seed = 1)
+ {
+ var rng = new Random(seed);
+ float centeredFloat() => (float)(rng.NextDouble() - 0.5);
+ for (int i = 0; i < numExamples; i++)
+ {
+ // Generate random, uncoupled features.
+ var data = new Data {
+ Features = new float[2] { centeredFloat(), centeredFloat() }
+ };
+ // Compute the label from the shape functions and add noise.
+ data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5;
+
+ yield return data;
+ }
+ }
+
+ private static float Parabola(float x) => x * x;
+
+ private static float SimplePiecewise(float x)
+ {
+ if (x < 0)
+ return 0;
+ else if (x < 0.25)
+ return 1;
+ else
+ return 0;
+ }
+
+ private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x));
+ }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs
new file mode 100644
index 0000000000..e4a408a3ae
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs
@@ -0,0 +1,163 @@
+using System;
+using System.Collections.Generic;
+using Microsoft.ML;
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers.FastTree;
+
+namespace Samples.Dynamic.Trainers.BinaryClassification
+{
+ public static class GamWithOptions
+ {
+ // This example requires installation of additional NuGet package
+ // Microsoft.ML.FastTree.
+ public static void Example()
+ {
+ // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+ // as a catalog of available operations and as the source of randomness.
+ var mlContext = new MLContext();
+
+ // Create the dataset.
+ var samples = GenerateData();
+
+ // Convert the dataset to an IDataView.
+ var data = mlContext.Data.LoadFromEnumerable(samples);
+
+ // Create training and validation datasets.
+ var dataSets = mlContext.Data.TrainTestSplit(data);
+ var trainSet = dataSets.TrainSet;
+ var validSet = dataSets.TestSet;
+
+ // Create a GAM trainer.
+ // Use a small number of bins for this example. The setting below means for each feature,
+ // we divide its range into 16 discrete regions for the training process. Note that these
+ // regions are not evenly spaced, and that the final model may contain fewer bins, as
+ // neighboring bins with identical values will be combined. In general, we recommend using
+ // at least the default number of bins, as a small number of bins limits the capacity of
+ // the model.
+ // Also, set the learning rate to half the default to slow down the gradient descent, and
+ // double the number of iterations to compensate.
+ var trainer = mlContext.BinaryClassification.Trainers.Gam(
+ new GamBinaryTrainer.Options {
+ NumberOfIterations = 19000,
+ MaximumBinCountPerFeature = 16,
+ LearningRate = 0.001
+ });
+
+ // Fit the model using both the training and validation sets. GAM can use a technique called
+ // pruning to tune the model to the validation set after training to improve generalization.
+ var model = trainer.Fit(trainSet, validSet);
+
+ // Extract the model parameters.
+ var gam = model.Model.SubModel;
+
+ // Now we can inspect the parameters of the Generalized Additive Model to understand the fit
+ // and potentially learn about our dataset.
+ // First, we will look at the bias; the bias represents the average prediction for the training data.
+ Console.WriteLine($"Average prediction: {gam.Bias:0.00}");
+
+ // Now look at the shape functions that the model has learned. Similar to a linear model, we have
+ // one response per feature, and they are independent. Unlike a linear model, this response is a
+ // generic function instead of a line. Because we have included a bias term, each feature response
+ // represents the deviation from the average prediction as a function of the feature value.
+ for (int i = 0; i < gam.NumberOfShapeFunctions; i++)
+ {
+ // Break a line.
+ Console.WriteLine();
+
+ // Get the bin upper bounds for the feature.
+ var binUpperBounds = gam.GetBinUpperBounds(i);
+
+ // Get the bin effects; these are the function values for each bin.
+ var binEffects = gam.GetBinEffects(i);
+
+ // Now, write the function to the console. The function is a set of bins, and the corresponding
+ // function values. You can think of GAMs as building a bar-chart or lookup table for each feature.
+ Console.WriteLine($"Feature{i}");
+ for (int j = 0; j < binUpperBounds.Count; j++)
+ Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}");
+ }
+
+ // Expected output:
+ // Average prediction: 0.82
+ //
+ // Feature0
+ // x < -0.44 => 0.286
+ // x < -0.38 => 0.225
+ // x < -0.32 => 0.048
+ // x < -0.26 => -0.110
+ // x < -0.20 => -0.116
+ // x < 0.18 => -0.143
+ // x < 0.25 => -0.115
+ // x < 0.31 => -0.005
+ // x < 0.37 => 0.097
+ // x < 0.44 => 0.263
+ // x < ∞ => 0.284
+ //
+ // Feature1
+ // x < 0.00 => -0.350
+ // x < 0.24 => 0.875
+ // x < 0.31 => -0.138
+ // x < ∞ => -0.188
+
+ // Let's consider this output. To score a given example, we look up the first bin where the inequality
+ // is satisfied for the feature value. We can look at the whole function to get a sense for how the
+ // model responds to the variable on a global level.
+ // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average
+ // expected output over the training set. Very few bins are used to model the second feature because the GAM model
+ // discards unchanged bins to create smaller models.
+ // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be
+ // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use
+ // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is
+ // real or just sampling noise. See for example:
+ // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
+ // Distillation." arXiv:1710.06169.
+ }
+
+ private class Data
+ {
+ public bool Label { get; set; }
+
+ [VectorType(2)]
+ public float[] Features { get; set; }
+ }
+
+ /// <summary>
+ /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. The first feature is a parabola centered around 0,
+ /// while the second feature is a simple piecewise function.
+ /// </summary>
+ /// <param name="numExamples">The number of examples to generate.</param>
+ /// <param name="seed">The seed for the random number generator used to produce data.</param>
+ /// <returns>An IEnumerable of Data objects.</returns>
+ private static IEnumerable<Data> GenerateData(int numExamples = 25000, int seed = 1)
+ {
+ var rng = new Random(seed);
+ float centeredFloat() => (float)(rng.NextDouble() - 0.5);
+ for (int i = 0; i < numExamples; i++)
+ {
+ // Generate random, uncoupled features.
+ var data = new Data
+ {
+ Features = new float[2] { centeredFloat(), centeredFloat() }
+ };
+ // Compute the label from the shape functions and add noise.
+ data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5;
+
+ yield return data;
+ }
+ }
+
+ private static float Parabola(float x) => x * x;
+
+ private static float SimplePiecewise(float x)
+ {
+ if (x < 0)
+ return 0;
+ else if (x < 0.25)
+ return 1;
+ else
+ return 0;
+ }
+
+ private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x));
+ }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs
index dd8107452a..b070dfbda1 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs
@@ -1,7 +1,7 @@
using System;
-using System.Linq;
+using System.Collections.Generic;
using Microsoft.ML;
-using Microsoft.ML.SamplesUtils;
+using Microsoft.ML.Data;
namespace Samples.Dynamic.Trainers.Regression
{
@@ -14,93 +14,145 @@ public static void Example()
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
var mlContext = new MLContext();
+
+ // Create the dataset.
+ var samples = GenerateData();
+
+ // Convert the dataset to an IDataView.
+ var data = mlContext.Data.LoadFromEnumerable(samples);
+
+ // Create training and validation sets.
+ var dataSets = mlContext.Data.TrainTestSplit(data);
+ var trainSet = dataSets.TrainSet;
+ var validSet = dataSets.TestSet;
+
+ // Create a GAM trainer.
+ // Use a small number of bins for this example. The setting below means for each feature,
+ // we divide its range into 16 discrete regions for the training process. Note that these
+ // regions are not evenly spaced, and that the final model may contain fewer bins, as
+ // neighboring bins with identical values will be combined. In general, we recommend using
+ // at least the default number of bins, as a small number of bins limits the capacity of
+ // the model.
+ var trainer = mlContext.Regression.Trainers.Gam(maximumBinCountPerFeature: 16);
+
+ // Fit the model using both the training and validation sets. GAM can use a technique called
+ // pruning to tune the model to the validation set after training to improve generalization.
+ var model = trainer.Fit(trainSet, validSet);
- // Read the Housing regression dataset
- var data = DatasetUtils.LoadHousingRegressionDataset(mlContext);
-
- var labelName = "MedianHomeValue";
- var featureNames = data.Schema
- .Select(column => column.Name) // Get the column names
- .Where(name => name != labelName) // Drop the Label
- .ToArray();
-
- // Create a pipeline.
- var pipeline =
- // Concatenate the features to create a Feature vector.
- mlContext.Transforms.Concatenate("Features", featureNames)
- // Append a GAM regression trainer, setting the "MedianHomeValue" column as the label of the dataset,
- // the "Features" column produced by concatenation as the features column,
- // and use a small number of bins to make it easy to visualize in the console window.
- // For real applications, it is recommended to start with the default number of bins.
- .Append(mlContext.Regression.Trainers.Gam(labelColumnName: labelName, featureColumnName: "Features", maximumBinCountPerFeature: 16));
-
- // Train the pipeline.
- var trainedPipeline = pipeline.Fit(data);
-
- // Extract the model from the pipeline.
- var gamModel = trainedPipeline.LastTransformer.Model;
-
- // Now investigate the bias and shape functions of the GAM model.
- // The bias represents the average prediction for the training data.
- Console.WriteLine($"Average predicted cost: {gamModel.Bias:0.00}");
+ // Extract the model parameters.
+ var gam = model.Model;
+
+ // Now we can inspect the parameters of the Generalized Additive Model to understand the fit
+ // and potentially learn about our dataset.
+ // First, we will look at the bias; the bias represents the average prediction for the training data.
+ Console.WriteLine($"Average prediction: {gam.Bias:0.00}");
+
+ // Now look at the shape functions that the model has learned. Similar to a linear model, we have
+ // one response per feature, and they are independent. Unlike a linear model, this response is a
+ // generic function instead of a line. Because we have included a bias term, each feature response
+ // represents the deviation from the average prediction as a function of the feature value.
+ for (int i = 0; i < gam.NumberOfShapeFunctions; i++)
+ {
+ // Break a line.
+ Console.WriteLine();
+
+ // Get the bin upper bounds for the feature.
+ var binUpperBounds = gam.GetBinUpperBounds(i);
+
+ // Get the bin effects; these are the function values for each bin.
+ var binEffects = gam.GetBinEffects(i);
+
+ // Now, write the function to the console. The function is a set of bins, and the corresponding
+ // function values. You can think of GAMs as building a bar-chart or lookup table for each feature.
+ Console.WriteLine($"Feature{i}");
+ for (int j = 0; j < binUpperBounds.Count; j++)
+ Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}");
+ }
// Expected output:
- // Average predicted cost: 22.53
-
- // Let's take a look at the features that the model built. Similar to a linear model, we have
- // one response per feature. Unlike a linear model, this response is a function instead of a line.
- // Each feature response represents the deviation from the average prediction as a function of the
- // feature value.
-
- // Let's investigate the TeacherRatio variable. This is the ratio of students to teachers,
- // so the higher it is, the more students a teacher has in their classroom.
- // First, let's get the index of the variable we want to look at.
- var studentTeacherRatioIndex = featureNames.ToList().FindIndex(str => str.Equals("TeacherRatio"));
-
- // Next, let's get the array of histogram bin upper bounds from the model for this feature.
- // For each feature, the shape function is calculated at `MaxBins` locations along the range of
- // values that the feature takes, and the resulting shape function can be seen as a histogram of
- // effects.
- var teacherRatioBinUpperBounds = gamModel.GetBinUpperBounds(studentTeacherRatioIndex);
- // And the array of bin effects; these are the effect size for each bin.
- var teacherRatioBinEffects = gamModel.GetBinEffects(studentTeacherRatioIndex);
-
- // Now, write the function to the console. The function is a set of bins, and the corresponding
- // function values. You can think of GAMs as building a bar-chart lookup table.
- Console.WriteLine("Student-Teacher Ratio");
- for (int i = 0; i < teacherRatioBinUpperBounds.Count; i++)
- Console.WriteLine($"x < {teacherRatioBinUpperBounds[i]:0.00} => {teacherRatioBinEffects[i]:0.000}");
-
- // Expected output:
- // Student-Teacher Ratio
- // x < 14.55 => 2.105
- // x < 14.75 => 2.326
- // x < 15.40 => 0.903
- // x < 16.50 => 0.651
- // x < 17.15 => 0.587
- // x < 17.70 => 0.624
- // x < 17.85 => 0.684
- // x < 18.35 => -0.315
- // x < 18.55 => -0.542
- // x < 18.75 => -0.083
- // x < 19.40 => -0.442
- // x < 20.55 => -0.649
- // x < 21.05 => -1.579
- // x < ∞ => 0.318
+ // Average prediction: 1.33
+ //
+ // Feature0
+ // x < -0.44 => 0.128
+ // x < -0.38 => 0.066
+ // x < -0.32 => 0.040
+ // x < -0.26 => -0.006
+ // x < -0.20 => -0.035
+ // x < -0.13 => -0.050
+ // x < 0.06 => -0.077
+ // x < 0.12 => -0.075
+ // x < 0.18 => -0.052
+ // x < 0.25 => -0.031
+ // x < 0.31 => -0.002
+ // x < 0.37 => 0.040
+ // x < 0.44 => 0.083
+ // x < ∞ => 0.123
+ //
+ // Feature1
+ // x < 0.00 => -0.245
+ // x < 0.06 => 0.671
+ // x < 0.24 => 0.723
+ // x < 0.31 => -0.141
+ // x < 0.37 => -0.241
+ // x < ∞ => -0.248
// Let's consider this output. To score a given example, we look up the first bin where the inequality
// is satisfied for the feature value. We can look at the whole function to get a sense for how the
- // model responds to the variable on a global level. For the student-teacher-ratio variable, we can see
- // that smaller class sizes are predictive of a higher house value, while student-teacher ratios higher
- // than about 18 lead to lower predictions in house value. This makes intuitive sense, as smaller class
- // sizes are desirable and also indicative of better-funded schools, which could make buyers likely to
- // pay more for the house.
-
- // Another thing to notice is that these feature functions can be noisy. See student-teacher ratios > 21.05.
- // Common practice is to use resampling methods to estimate a confidence interval at each bin. This will
- // help to determine if the effect is real or just sampling noise. See for example
+ // model responds to the variable on a global level.
+ // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average
+ // expected output over the training set. Very few bins are used to model the second feature because the GAM model
+ // discards unchanged bins to create smaller models.
+ // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be
+ // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use
+ // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is
+ // real or just sampling noise. See for example:
// Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
// Distillation." arXiv:1710.06169.
}
+
+ private class Data
+ {
+ public float Label { get; set; }
+
+ [VectorType(2)]
+ public float[] Features { get; set; }
+ }
+
+ /// <summary>
+ /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. The first feature is a parabola centered around 0,
+ /// while the second feature is a simple piecewise function.
+ /// </summary>
+ /// <param name="numExamples">The number of examples to generate.</param>
+ /// <param name="seed">The seed for the random number generator used to produce data.</param>
+ /// <returns>An IEnumerable of Data objects.</returns>
+ private static IEnumerable<Data> GenerateData(int numExamples = 25000, int seed = 1)
+ {
+ float bias = 1.0f;
+ var rng = new Random(seed);
+ float centeredFloat() => (float)(rng.NextDouble() - 0.5);
+ for (int i = 0; i < numExamples; i++)
+ {
+ // Generate random, uncoupled features.
+ var data = new Data {
+ Features = new float[2] { centeredFloat(), centeredFloat() }
+ };
+ // Compute the label from the shape functions and add noise.
+ data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat();
+
+ yield return data;
+ }
+ }
+
+ private static float Parabola(float x) => x * x;
+
+ private static float SimplePiecewise(float x)
+ {
+ if (x < 0)
+ return 0;
+ else if (x < 0.25)
+ return 1;
+ else
+ return 0;
+ }
}
}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs
index 33617b2d94..6c973814fd 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
-using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.FastTree;
@@ -15,92 +14,182 @@ public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
- // Setting the seed to a fixed number in this example to make outputs deterministic.
- var mlContext = new MLContext(seed: 0);
-
- // Create a list of training examples.
- var examples = GenerateRandomDataPoints(1000);
-
- // Convert the examples list to an IDataView object, which is consumable by ML.NET API.
- var trainingData = mlContext.Data.LoadFromEnumerable(examples);
-
- // Define trainer options.
- var options = new GamRegressionTrainer.Options
+ var mlContext = new MLContext();
+
+ // Create the dataset.
+ var samples = GenerateData();
+
+ // Convert the dataset to an IDataView.
+ var data = mlContext.Data.LoadFromEnumerable(samples);
+
+ // Create training and validation sets.
+ var dataSets = mlContext.Data.TrainTestSplit(data);
+ var trainSet = dataSets.TrainSet;
+ var validSet = dataSets.TestSet;
+
+ // Create a GAM trainer.
+ // Use a small number of bins for this example. The setting below means for each feature,
+ // we divide its range into 16 discrete regions for the training process. Note that these
+ // regions are not evenly spaced, and that the final model may contain fewer bins, as
+ // neighboring bins with identical values will be combined. In general, we recommend using
+ // at least the default number of bins, as a small number of bins limits the capacity of
+ // the model.
+ // Also, change the pruning metrics to use the mean absolute error for pruning.
+ var trainer = mlContext.Regression.Trainers.Gam(
+ new GamRegressionTrainer.Options {
+ MaximumBinCountPerFeature = 16,
+ PruningMetrics = 1
+ });
+
+ // Fit the model using both the training and validation sets. GAM can use a technique called
+ // pruning to tune the model to the validation set after training to improve generalization.
+ var model = trainer.Fit(trainSet, validSet);
+
+ // Extract the model parameters.
+ var gam = model.Model;
+
+ // Now we can inspect the parameters of the Generalized Additive Model to understand the fit
+ // and potentially learn about our dataset.
+ // First, we will look at the bias; the bias represents the average prediction for the training data.
+ Console.WriteLine($"Average prediction: {gam.Bias:0.00}");
+
+ // Let's take a look at the features that the model built. Similar to a linear model, we have
+ // one response per feature. Unlike a linear model, this response is a function instead of a line.
+ // Each feature response represents the deviation from the average prediction as a function of the
+ // feature value.
+ for (int i = 0; i < gam.NumberOfShapeFunctions; i++)
{
- // The entropy (regularization) coefficient.
- EntropyCoefficient = 0.3,
- // Reduce the number of iterations to 50.
- NumberOfIterations = 50
- };
-
- // Define the trainer.
- var pipeline = mlContext.Regression.Trainers.Gam(options);
-
- // Train the model.
- var model = pipeline.Fit(trainingData);
+ // Break a line.
+ Console.WriteLine();
- // Create testing examples. Use different random seed to make it different from training data.
- var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed:123));
+ // Get the bin upper bounds for the feature.
+ var binUpperBounds = gam.GetBinUpperBounds(i);
- // Run the model on test data set.
- var transformedTestData = model.Transform(testData);
+ // Get the bin effects; these are the function values for each bin.
+ var binEffects = gam.GetBinEffects(i);
- // Convert IDataView object to a list.
- var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();
-
- // Look at 5 predictions
- foreach (var p in predictions.Take(5))
- Console.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}");
+ // Now, write the function to the console. The function is a set of bins, and the corresponding
+ // function values. You can think of GAMs as building a bar-chart or lookup table for each feature.
+ Console.WriteLine($"Feature{i}");
+ for (int j = 0; j < binUpperBounds.Count; j++)
+ Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}");
+ }
// Expected output:
- // Label: 0.985, Prediction: 0.841
- // Label: 0.155, Prediction: 0.187
- // Label: 0.515, Prediction: 0.496
- // Label: 0.566, Prediction: 0.467
- // Label: 0.096, Prediction: 0.144
+ // Average prediction: 1.33
+ //
+ // Feature0
+ // x < -0.44 => 0.128
+ // x < -0.38 => 0.066
+ // x < -0.32 => 0.040
+ // x < -0.26 => -0.006
+ // x < -0.20 => -0.035
+ // x < -0.13 => -0.050
+ // x < 0.06 => -0.077
+ // x < 0.12 => -0.075
+ // x < 0.18 => -0.052
+ // x < 0.25 => -0.031
+ // x < 0.31 => -0.002
+ // x < 0.37 => 0.040
+ // x < 0.44 => 0.083
+ // x < ∞ => 0.123
+ //
+ // Feature1
+ // x < 0.00 => -0.245
+ // x < 0.06 => 0.671
+ // x < 0.24 => 0.723
+ // x < 0.31 => -0.141
+ // x < 0.37 => -0.241
+ // x < ∞ => -0.248
+
+ // Let's consider this output. To score a given example, we look up the first bin where the inequality
+ // is satisfied for the feature value. We can look at the whole function to get a sense for how the
+ // model responds to the variable on a global level.
+ // The model can be seen to reconstruct the parabolic and step-wise function, shifted with respect to the average
+ // expected output over the training set. Very few bins are used to model the second feature because the GAM model
+ // discards unchanged bins to create smaller models.
+ // One last thing to notice is that these feature functions can be noisy. While we know that Feature1 should be
+ // symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use
+ // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is
+ // real or just sampling noise. See for example:
+ // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model
+ // Distillation." arXiv:1710.06169.
+ }
- // Evaluate the overall metrics
- var metrics = mlContext.Regression.Evaluate(transformedTestData);
- Microsoft.ML.SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
+// Feature0
+//x< -0.44 => 0.131
+//x< -0.38 => 0.067
+//x< -0.32 => 0.041
+//x< -0.26 => -0.005
+//x< -0.20 => -0.035
+//x< -0.13 => -0.050
+//x< -0.07 => -0.079
+//x< -0.01 => -0.083
+//x< 0.06 => -0.079
+//x< 0.12 => -0.075
+//x< 0.18 => -0.052
+//x< 0.25 => -0.030
+//x< 0.31 => -0.002
+//x< 0.37 => 0.041
+//x< 0.44 => 0.084
+//x< ∞ => 0.126
+
+//Feature1
+//x< -0.37 => -0.255
+//x< -0.25 => -0.247
+//x< 0.00 => -0.249
+//x< 0.06 => 0.671
+//x< 0.12 => 0.743
+//x< 0.24 => 0.746
+//x< 0.31 => -0.143
+//x< 0.37 => -0.245
+//x< 0.43 => -0.261
+//x< ∞ => -0.257
+
+ private class Data
+ {
+ public float Label { get; set; }
- // Expected output:
- // Mean Absolute Error: 0.06
- // Mean Squared Error: 0.01
- // Root Mean Squared Error: 0.08
- // RSquared: 0.93
+ [VectorType(2)]
+ public float[] Features { get; set; }
}
- private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed=0)
+ /// <summary>
+ /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. The first feature is a parabola centered around 0,
+ /// while the second feature is a simple piecewise function.
+ /// </summary>
+ /// <param name="numExamples">The number of examples to generate.</param>
+ /// <param name="seed">The seed for the random number generator used to produce data.</param>
+ /// <returns>An IEnumerable of Data objects.</returns>
+ private static IEnumerable<Data> GenerateData(int numExamples = 25000, int seed = 1)
{
- var random = new Random(seed);
- float randomFloat() => (float)random.NextDouble();
- for (int i = 0; i < count; i++)
+ float bias = 1.0f;
+ var rng = new Random(seed);
+ float centeredFloat() => (float)(rng.NextDouble() - 0.5);
+ for (int i = 0; i < numExamples; i++)
{
- var label = randomFloat();
- yield return new DataPoint
+ // Generate random, uncoupled features.
+ var data = new Data
{
- Label = label,
- // Create random features that are correlated with label.
- Features = Enumerable.Repeat(label, 50).Select(x => x + randomFloat()).ToArray()
+ Features = new float[2] { centeredFloat(), centeredFloat() }
};
+ // Compute the label from the shape functions and add noise.
+ data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat();
+
+ yield return data;
}
}
- // Example with label and 50 feature values. A data set is a collection of such examples.
- private class DataPoint
- {
- public float Label { get; set; }
- [VectorType(50)]
- public float[] Features { get; set; }
- }
+ private static float Parabola(float x) => x * x;
- // Class used to capture predictions.
- private class Prediction
+ private static float SimplePiecewise(float x)
{
- // Original label.
- public float Label { get; set; }
- // Predicted score from the trainer.
- public float Score { get; set; }
+ if (x < 0)
+ return 0;
+ else if (x < 0.25)
+ return 1;
+ else
+ return 0;
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs
index 02ab57a7fa..7031a673da 100644
--- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs
+++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs
@@ -173,6 +173,13 @@ public static FastTreeRankingTrainer FastTree(this RankingCatalog.RankingTrainer
/// The number of iterations to use in learning the features.
/// The maximum number of bins to use to approximate features.
/// The learning rate. GAMs work best with a small learning rate.
+ ///
+ ///
+ ///
+ ///
+ ///
public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog,
string labelColumnName = DefaultColumnNames.Label,
string featureColumnName = DefaultColumnNames.Features,
@@ -191,6 +198,13 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi
///
/// The .
/// Trainer options.
+ ///
+ ///
+ ///
+ ///
+ ///
public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog,
GamBinaryTrainer.Options options)
{
@@ -212,7 +226,7 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi
///
///
///
///
///
@@ -237,7 +251,7 @@ public static GamRegressionTrainer Gam(this RegressionCatalog.RegressionTrainers
///
///
///
///
///