diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs new file mode 100644 index 0000000000..3b9c36644f --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Gam.cs @@ -0,0 +1,154 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic.Trainers.BinaryClassification +{ + public static class Gam + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create the dataset. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. + var trainer = mlContext.BinaryClassification.Trainers.Gam(maximumBinCountPerFeature: 16); + + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. 
+ var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model.SubModel; + + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. Unlike a linear model, this response is a + // generic function instead of a line. Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + + // Get the bin effects; these are the function values for each bin. + var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. + Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } + + // Expected output: + // Average prediction: 0.82 + // + // Feature0 + // x < -0.44 => 0.286 + // x < -0.38 => 0.225 + // x < -0.32 => 0.048 + // x < -0.26 => -0.110 + // x < -0.20 => -0.116 + // x < 0.18 => -0.143 + // x < 0.25 => -0.115 + // x < 0.31 => -0.005 + // x < 0.37 => 0.097 + // x < 0.44 => 0.263 + // x < ∞ => 0.284 + // + // Feature1 + // x < 0.00 => -0.350 + // x < 0.24 => 0.875 + // x < 0.31 => -0.138 + // x < ∞ => -0.188 + + // Let's consider this output. 
To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise functions, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature0 (the parabola) + // should be symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." arXiv:1710.06169. + } + + private class Data + { + public bool Label { get; set; } + + [VectorType(2)] + public float[] Features { get; set; } + } + + /// <summary> + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature0 (Features[0]) contributes a parabola + /// centered around 0, while Feature1 (Features[1]) contributes a simple piecewise function. + /// </summary> + /// <param name="numExamples">The number of examples to generate.</param> + /// <param name="seed">The seed for the random number generator used to produce data.</param> + /// <returns>The generated examples.</returns> + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data { + Features = new float[2] { centeredFloat(), centeredFloat() } + }; + // Compute the label from the shape functions and add noise. 
+ data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5; + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs new file mode 100644 index 0000000000..e4a408a3ae --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/GamWithOptions.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Trainers.BinaryClassification +{ + public static class GamWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(); + + // Create the dataset. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation datasets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. 
Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. + // Also, set the learning rate to half the default to slow down the gradient descent, and + // double the number of iterations to compensate. + var trainer = mlContext.BinaryClassification.Trainers.Gam( + new GamBinaryTrainer.Options { + NumberOfIterations = 19000, + MaximumBinCountPerFeature = 16, + LearningRate = 0.001 + }); + + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. + var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model.SubModel; + + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. Unlike a linear model, this response is a + // generic function instead of a line. Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + + // Get the bin effects; these are the function values for each bin. 
+ var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. + Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } + + // Expected output: + // Average prediction: 0.82 + // + // Feature0 + // x < -0.44 => 0.286 + // x < -0.38 => 0.225 + // x < -0.32 => 0.048 + // x < -0.26 => -0.110 + // x < -0.20 => -0.116 + // x < 0.18 => -0.143 + // x < 0.25 => -0.115 + // x < 0.31 => -0.005 + // x < 0.37 => 0.097 + // x < 0.44 => 0.263 + // x < ∞ => 0.284 + // + // Feature1 + // x < 0.00 => -0.350 + // x < 0.24 => 0.875 + // x < 0.31 => -0.138 + // x < ∞ => -0.188 + + // Let's consider this output. To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise functions, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature0 (the parabola) + // should be symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." arXiv:1710.06169. 
+ } + + private class Data + { + public bool Label { get; set; } + + [VectorType(2)] + public float[] Features { get; set; } + } + + /// <summary> + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature0 (Features[0]) contributes a parabola + /// centered around 0, while Feature1 (Features[1]) contributes a simple piecewise function. + /// </summary> + /// <param name="numExamples">The number of examples to generate.</param> + /// <param name="seed">The seed for the random number generator used to produce data.</param> + /// <returns>The generated examples.</returns> + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data + { + Features = new float[2] { centeredFloat(), centeredFloat() } + }; + // Compute the label from the shape functions and add noise. + data.Label = Sigmoid(Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat()) > 0.5; + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs index dd8107452a..b070dfbda1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/Gam.cs @@ -1,7 +1,7 @@ using System; -using System.Linq; +using System.Collections.Generic; using Microsoft.ML; -using Microsoft.ML.SamplesUtils; +using Microsoft.ML.Data; namespace Samples.Dynamic.Trainers.Regression { @@ -14,93 +14,145 @@ public static void Example() // Create a new context for ML.NET operations. 
It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); + + // Create the dataset. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. + // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. + var trainer = mlContext.Regression.Trainers.Gam(maximumBinCountPerFeature: 16); + + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. + var model = trainer.Fit(trainSet, validSet); - // Read the Housing regression dataset - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); - - var labelName = "MedianHomeValue"; - var featureNames = data.Schema - .Select(column => column.Name) // Get the column names - .Where(name => name != labelName) // Drop the Label - .ToArray(); - - // Create a pipeline. - var pipeline = - // Concatenate the features to create a Feature vector. 
- mlContext.Transforms.Concatenate("Features", featureNames) - // Append a GAM regression trainer, setting the "MedianHomeValue" column as the label of the dataset, - // the "Features" column produced by concatenation as the features column, - // and use a small number of bins to make it easy to visualize in the console window. - // For real applications, it is recommended to start with the default number of bins. - .Append(mlContext.Regression.Trainers.Gam(labelColumnName: labelName, featureColumnName: "Features", maximumBinCountPerFeature: 16)); - - // Train the pipeline. - var trainedPipeline = pipeline.Fit(data); - - // Extract the model from the pipeline. - var gamModel = trainedPipeline.LastTransformer.Model; - - // Now investigate the bias and shape functions of the GAM model. - // The bias represents the average prediction for the training data. - Console.WriteLine($"Average predicted cost: {gamModel.Bias:0.00}"); + // Extract the model parameters. + var gam = model.Model; + + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Now look at the shape functions that the model has learned. Similar to a linear model, we have + // one response per feature, and they are independent. Unlike a linear model, this response is a + // generic function instead of a line. Because we have included a bias term, each feature response + // represents the deviation from the average prediction as a function of the feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) + { + // Break a line. + Console.WriteLine(); + + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); + + // Get the bin effects; these are the function values for each bin. 
+ var binEffects = gam.GetBinEffects(i); + + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. + Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } // Expected output: - // Average predicted cost: 22.53 - - // Let's take a look at the features that the model built. Similar to a linear model, we have - // one response per feature. Unlike a linear model, this response is a function instead of a line. - // Each feature response represents the deviation from the average prediction as a function of the - // feature value. - - // Let's investigate the TeacherRatio variable. This is the ratio of students to teachers, - // so the higher it is, the more students a teacher has in their classroom. - // First, let's get the index of the variable we want to look at. - var studentTeacherRatioIndex = featureNames.ToList().FindIndex(str => str.Equals("TeacherRatio")); - - // Next, let's get the array of histogram bin upper bounds from the model for this feature. - // For each feature, the shape function is calculated at `MaxBins` locations along the range of - // values that the feature takes, and the resulting shape function can be seen as a histogram of - // effects. - var teacherRatioBinUpperBounds = gamModel.GetBinUpperBounds(studentTeacherRatioIndex); - // And the array of bin effects; these are the effect size for each bin. - var teacherRatioBinEffects = gamModel.GetBinEffects(studentTeacherRatioIndex); - - // Now, write the function to the console. The function is a set of bins, and the corresponding - // function values. You can think of GAMs as building a bar-chart lookup table. 
- Console.WriteLine("Student-Teacher Ratio"); - for (int i = 0; i < teacherRatioBinUpperBounds.Count; i++) - Console.WriteLine($"x < {teacherRatioBinUpperBounds[i]:0.00} => {teacherRatioBinEffects[i]:0.000}"); - - // Expected output: - // Student-Teacher Ratio - // x < 14.55 => 2.105 - // x < 14.75 => 2.326 - // x < 15.40 => 0.903 - // x < 16.50 => 0.651 - // x < 17.15 => 0.587 - // x < 17.70 => 0.624 - // x < 17.85 => 0.684 - // x < 18.35 => -0.315 - // x < 18.55 => -0.542 - // x < 18.75 => -0.083 - // x < 19.40 => -0.442 - // x < 20.55 => -0.649 - // x < 21.05 => -1.579 - // x < ∞ => 0.318 + // Average prediction: 1.33 + // + // Feature0 + // x < -0.44 => 0.128 + // x < -0.38 => 0.066 + // x < -0.32 => 0.040 + // x < -0.26 => -0.006 + // x < -0.20 => -0.035 + // x < -0.13 => -0.050 + // x < 0.06 => -0.077 + // x < 0.12 => -0.075 + // x < 0.18 => -0.052 + // x < 0.25 => -0.031 + // x < 0.31 => -0.002 + // x < 0.37 => 0.040 + // x < 0.44 => 0.083 + // x < ∞ => 0.123 + // + // Feature1 + // x < 0.00 => -0.245 + // x < 0.06 => 0.671 + // x < 0.24 => 0.723 + // x < 0.31 => -0.141 + // x < 0.37 => -0.241 + // x < ∞ => -0.248 // Let's consider this output. To score a given example, we look up the first bin where the inequality // is satisfied for the feature value. We can look at the whole function to get a sense for how the - // model responds to the variable on a global level. For the student-teacher-ratio variable, we can see - // that smaller class sizes are predictive of a higher house value, while student-teacher ratios higher - // than about 18 lead to lower predictions in house value. This makes intuitive sense, as smaller class - // sizes are desirable and also indicative of better-funded schools, which could make buyers likely to - // pay more for the house. - - // Another thing to notice is that these feature functions can be noisy. See student-teacher ratios > 21.05. 
- // Common practice is to use resampling methods to estimate a confidence interval at each bin. This will - // help to determine if the effect is real or just sampling noise. See for example + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise functions, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature0 (the parabola) + // should be symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model // Distillation." arXiv:1710.06169." } + + private class Data + { + public float Label { get; set; } + + [VectorType(2)] + public float[] Features { get; set; } + } + + /// <summary> + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature0 (Features[0]) contributes a parabola + /// centered around 0, while Feature1 (Features[1]) contributes a simple piecewise function. + /// </summary> + /// <param name="numExamples">The number of examples to generate.</param> + /// <param name="seed">The seed for the random number generator used to produce data.</param> + /// <returns>The generated examples.</returns> + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) + { + float bias = 1.0f; + var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); + for (int i = 0; i < numExamples; i++) + { + // Generate random, uncoupled features. + var data = new Data { + Features = new float[2] { centeredFloat(), centeredFloat() } + }; + // Compute the label from the shape functions and add noise. 
+ data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat(); + + yield return data; + } + } + + private static float Parabola(float x) => x * x; + + private static float SimplePiecewise(float x) + { + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; + } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs index 33617b2d94..6c973814fd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/GamWithOptions.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -15,92 +14,182 @@ public static void Example() { // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. - // Setting the seed to a fixed number in this example to make outputs deterministic. - var mlContext = new MLContext(seed: 0); - - // Create a list of training examples. - var examples = GenerateRandomDataPoints(1000); - - // Convert the examples list to an IDataView object, which is consumable by ML.NET API. - var trainingData = mlContext.Data.LoadFromEnumerable(examples); - - // Define trainer options. - var options = new GamRegressionTrainer.Options + var mlContext = new MLContext(); + + // Create the dataset. + var samples = GenerateData(); + + // Convert the dataset to an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create training and validation sets. + var dataSets = mlContext.Data.TrainTestSplit(data); + var trainSet = dataSets.TrainSet; + var validSet = dataSets.TestSet; + + // Create a GAM trainer. 
+ // Use a small number of bins for this example. The setting below means for each feature, + // we divide its range into 16 discrete regions for the training process. Note that these + // regions are not evenly spaced, and that the final model may contain fewer bins, as + // neighboring bins with identical values will be combined. In general, we recommend using + // at least the default number of bins, as a small number of bins limits the capacity of + // the model. + // Also, change the pruning metrics to use the mean absolute error for pruning. + var trainer = mlContext.Regression.Trainers.Gam( + new GamRegressionTrainer.Options { + MaximumBinCountPerFeature = 16, + PruningMetrics = 1 + }); + + // Fit the model using both of training and validation sets. GAM can use a technique called + // pruning to tune the model to the validation set after training to improve generalization. + var model = trainer.Fit(trainSet, validSet); + + // Extract the model parameters. + var gam = model.Model; + + // Now we can inspect the parameters of the Generalized Additive Model to understand the fit + // and potentially learn about our dataset. + // First, we will look at the bias; the bias represents the average prediction for the training data. + Console.WriteLine($"Average prediction: {gam.Bias:0.00}"); + + // Let's take a look at the features that the model built. Similar to a linear model, we have + // one response per feature. Unlike a linear model, this response is a function instead of a line. + // Each feature response represents the deviation from the average prediction as a function of the + // feature value. + for (int i = 0; i < gam.NumberOfShapeFunctions; i++) { - // The entropy (regularization) coefficient. - EntropyCoefficient = 0.3, - // Reduce the number of iterations to 50. - NumberOfIterations = 50 - }; - - // Define the trainer. - var pipeline = mlContext.Regression.Trainers.Gam(options); - - // Train the model. 
- var model = pipeline.Fit(trainingData); + // Break a line. + Console.WriteLine(); - // Create testing examples. Use different random seed to make it different from training data. - var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed:123)); + // Get the bin upper bounds for the feature. + var binUpperBounds = gam.GetBinUpperBounds(i); - // Run the model on test data set. - var transformedTestData = model.Transform(testData); + // Get the bin effects; these are the function values for each bin. + var binEffects = gam.GetBinEffects(i); - // Convert IDataView object to a list. - var predictions = mlContext.Data.CreateEnumerable(transformedTestData, reuseRowObject: false).ToList(); - - // Look at 5 predictions - foreach (var p in predictions.Take(5)) - Console.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}"); + // Now, write the function to the console. The function is a set of bins, and the corresponding + // function values. You can think of GAMs as building a bar-chart or lookup table for each feature. 
+ Console.WriteLine($"Feature{i}"); + for (int j = 0; j < binUpperBounds.Count; j++) + Console.WriteLine($"x < {binUpperBounds[j]:0.00} => {binEffects[j]:0.000}"); + } // Expected output: - // Label: 0.985, Prediction: 0.841 - // Label: 0.155, Prediction: 0.187 - // Label: 0.515, Prediction: 0.496 - // Label: 0.566, Prediction: 0.467 - // Label: 0.096, Prediction: 0.144 + // Average prediction: 1.33 + // + // Feature0 + // x < -0.44 => 0.128 + // x < -0.38 => 0.066 + // x < -0.32 => 0.040 + // x < -0.26 => -0.006 + // x < -0.20 => -0.035 + // x < -0.13 => -0.050 + // x < 0.06 => -0.077 + // x < 0.12 => -0.075 + // x < 0.18 => -0.052 + // x < 0.25 => -0.031 + // x < 0.31 => -0.002 + // x < 0.37 => 0.040 + // x < 0.44 => 0.083 + // x < ∞ => 0.123 + // + // Feature1 + // x < 0.00 => -0.245 + // x < 0.06 => 0.671 + // x < 0.24 => 0.723 + // x < 0.31 => -0.141 + // x < 0.37 => -0.241 + // x < ∞ => -0.248 + + // Let's consider this output. To score a given example, we look up the first bin where the inequality + // is satisfied for the feature value. We can look at the whole function to get a sense for how the + // model responds to the variable on a global level. + // The model can be seen to reconstruct the parabolic and step-wise functions, shifted with respect to the average + // expected output over the training set. Very few bins are used to model the second feature because the GAM model + // discards unchanged bins to create smaller models. + // One last thing to notice is that these feature functions can be noisy. While we know that Feature0 (the parabola) + // should be symmetric, this is not captured in the model. This is due to noise in the data. Common practice is to use + // resampling methods to estimate a confidence interval at each bin. This will help to determine if the effect is + // real or just sampling noise. See for example: + // Tan, Caruana, Hooker, and Lou. "Distill-and-Compare: Auditing Black-Box Models Using Transparent Model + // Distillation." 
arXiv:1710.06169. + } - // Evaluate the overall metrics - var metrics = mlContext.Regression.Evaluate(transformedTestData); - Microsoft.ML.SamplesUtils.ConsoleUtils.PrintMetrics(metrics); +// Feature0 +//x< -0.44 => 0.131 +//x< -0.38 => 0.067 +//x< -0.32 => 0.041 +//x< -0.26 => -0.005 +//x< -0.20 => -0.035 +//x< -0.13 => -0.050 +//x< -0.07 => -0.079 +//x< -0.01 => -0.083 +//x< 0.06 => -0.079 +//x< 0.12 => -0.075 +//x< 0.18 => -0.052 +//x< 0.25 => -0.030 +//x< 0.31 => -0.002 +//x< 0.37 => 0.041 +//x< 0.44 => 0.084 +//x< ∞ => 0.126 + +//Feature1 +//x< -0.37 => -0.255 +//x< -0.25 => -0.247 +//x< 0.00 => -0.249 +//x< 0.06 => 0.671 +//x< 0.12 => 0.743 +//x< 0.24 => 0.746 +//x< 0.31 => -0.143 +//x< 0.37 => -0.245 +//x< 0.43 => -0.261 +//x< ∞ => -0.257 + + private class Data + { + public float Label { get; set; } - // Expected output: - // Mean Absolute Error: 0.06 - // Mean Squared Error: 0.01 - // Root Mean Squared Error: 0.08 - // RSquared: 0.93 + [VectorType(2)] + public float[] Features { get; set; } } - private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + /// <summary> + /// Creates a dataset, an IEnumerable of Data objects, for a GAM sample. Feature0 (Features[0]) contributes a parabola + /// centered around 0, while Feature1 (Features[1]) contributes a simple piecewise function. + /// </summary> + /// <param name="numExamples">The number of examples to generate.</param> + /// <param name="seed">The seed for the random number generator used to produce data.</param> + /// <returns>The generated examples.</returns> + private static IEnumerable GenerateData(int numExamples = 25000, int seed = 1) { - var random = new Random(seed); - float randomFloat() => (float)random.NextDouble(); - for (int i = 0; i < count; i++) + float bias = 1.0f; + var rng = new Random(seed); + float centeredFloat() => (float)(rng.NextDouble() - 0.5); + for (int i = 0; i < numExamples; i++) { - var label = randomFloat(); - yield return new DataPoint + // Generate random, uncoupled features. + var data = new Data { - Label = label, - // Create random features that are correlated with label. 
- Features = Enumerable.Repeat(label, 50).Select(x => x + randomFloat()).ToArray() + Features = new float[2] { centeredFloat(), centeredFloat() } }; + // Compute the label from the shape functions and add noise. + data.Label = bias + Parabola(data.Features[0]) + SimplePiecewise(data.Features[1]) + centeredFloat(); + + yield return data; } } - // Example with label and 50 feature values. A data set is a collection of such examples. - private class DataPoint - { - public float Label { get; set; } - [VectorType(50)] - public float[] Features { get; set; } - } + private static float Parabola(float x) => x * x; - // Class used to capture predictions. - private class Prediction + private static float SimplePiecewise(float x) { - // Original label. - public float Label { get; set; } - // Predicted score from the trainer. - public float Score { get; set; } + if (x < 0) + return 0; + else if (x < 0.25) + return 1; + else + return 0; } } -} \ No newline at end of file +} diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 02ab57a7fa..7031a673da 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -173,6 +173,13 @@ public static FastTreeRankingTrainer FastTree(this RankingCatalog.RankingTrainer /// The number of iterations to use in learning the features. /// The maximum number of bins to use to approximate features. /// The learning rate. GAMs work best with a small learning rate. + /// + /// + /// + /// + /// public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, string labelColumnName = DefaultColumnNames.Label, string featureColumnName = DefaultColumnNames.Features, @@ -191,6 +198,13 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi /// /// The . /// Trainer options. 
+ /// + /// + /// + /// + /// public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, GamBinaryTrainer.Options options) { @@ -212,7 +226,7 @@ public static GamBinaryTrainer Gam(this BinaryClassificationCatalog.BinaryClassi /// /// /// /// /// @@ -237,7 +251,7 @@ public static GamRegressionTrainer Gam(this RegressionCatalog.RegressionTrainers /// /// /// /// ///