From 75105b2fcd7efbaec1575ad8b15d7ae5e4b61f3d Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 5 Apr 2019 18:26:09 +0200 Subject: [PATCH 1/7] samples custom, missingindicator, missingreplace --- .../Dynamic/Transforms/CustomMappingSample.cs | 116 ++++++++++------- .../CustomMappingSampleSaveAndLoad.cs | 119 ++++++++++++++++++ .../Transforms/IndicateMissingValues.cs | 1 - .../IndicateMissingValuesMultiColumn.cs | 83 ++++++++++++ .../Transforms/ReplaceMissingValues.cs | 2 +- .../ReplaceMissingValuesMultiColumn.cs | 119 ++++++++++++++++++ .../ComponentModel/ComponentCatalog.cs | 3 +- .../CustomMappingCatalog.cs | 1 + .../ExtensionsCatalog.cs | 12 ++ 9 files changed, 405 insertions(+), 51 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs index d4b5b5904e..b02c31bed2 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs @@ -1,4 +1,7 @@ using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms; + namespace Microsoft.ML.Samples.Dynamic { public static class CustomMapping @@ -10,71 +13,88 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and convert it to an IDataView. - var data = SamplesUtils.DatasetUtils.GetInfertData(); - var trainData = mlContext.Data.LoadFromEnumerable(data); + var rawData = GetData(); + + // Printing the input data. 
+ Console.WriteLine("Age\t Salary"); + foreach (var row in rawData) + Console.WriteLine($"{row.Age}\t {row.Salary}"); + // Expected output: + // Age Salary + // 26 40000 + // 35 80000 + // 34 10000 + // 28 100000 - // Preview of the data. - // Age RowNum Education ... - // 26 0 0-5yrs ... - // 42 1 0-5yrs ... - // 39 2 12+yrs ... - // 34 3 0-5yrs ... - // 35 4 6-11yrs ... + var data = mlContext.Data.LoadFromEnumerable(rawData); // We define the custom mapping between input and output rows that will be applied by the transformation. - Action mapping = + Action mapping = (input, output) => output.IsUnderThirty = input.Age < 30; - // Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly. - var estimator = mlContext.Transforms.CustomMapping(mapping, null); - var transformedData = estimator.Fit(trainData).Transform(trainData); + // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. + // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, cannot be saved and loaded back. + // See other sample on how to load and save the CustomMapping estimator. + var estimator = mlContext.Transforms.CustomMapping(mapping, contractName: null); + var transformedData = estimator.Fit(data).Transform(data); - // Preview 5 lines of the transformed data. - transformedData = mlContext.Data.TakeRows(transformedData, 5); - var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); - Console.WriteLine("IsUnderThirty\t Age\t RowNum\t Education\t ..."); + // Printing the output data. 
+ var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + Console.WriteLine("Age\t Salary\t IsUnderThirty"); foreach (var row in dataEnumerable) - Console.WriteLine($"{row.IsUnderThirty}\t {row.Age}\t {row.RowNum}\t {row.Education}\t ..."); + Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}"); // Expected output: - // IsUnderThirty Age RowNum Education ... - // True 26 0 0-5yrs ... - // False 42 1 0-5yrs ... - // False 39 2 12+yrs ... - // False 34 3 0-5yrs ... - // False 35 4 6-11yrs ... - + // Age Salary IsUnderThirty + // 26 40000 True + // 35 80000 False + // 34 10000 False + // 28 100000 True + } - // Here instead we use it as part of a pipeline of estimators. - var pipeline = mlContext.Transforms.CustomMapping(mapping, null) - .Append(mlContext.Transforms.Concatenate(outputColumnName: "Features", inputColumnNames: new[] { "Parity", "Induced" })) - // It is useful to add a caching checkpoint before a trainer that does several passes over the data. - .AppendCacheCheckpoint(mlContext) - // We use binary FastTree to predict the label column that was generated by the custom mapping at the first step of the pipeline. - .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "IsUnderThirty")); + // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. + public class CustomMappingOutput + { + public bool IsUnderThirty { get; set; } + } - // We can train the pipeline and use it to transform data. - transformedData = pipeline.Fit(trainData).Transform(trainData); + // Defines the schema of the input data. + public class InputData + { + public float Age { get; set; } + public float Salary { get; set; } } - // This defines only the column to be generated by the transformation in addition to the columns already present. 
- public class OutputRow + // Defines the schema of the transformed data, which includes the new column IsUnderThirty. + public class TransformedData { + public float Age { get; set; } + public float Salary { get; set; } public bool IsUnderThirty { get; set; } + } - // Represents the transformed infertility dataset. - public class SampleInfertDataTransformed + // Returns an enumerable of input rows. + public static IEnumerable GetData() { - public bool IsUnderThirty { get; set; } - public float Age { get; set; } - public int RowNum { get; set; } - public string Education { get; set; } - public float Parity { get; set; } - public float Induced { get; set; } - public float Case { get; set; } - public float Spontaneous { get; set; } - public float Stratum { get; set; } - public float PooledStratum { get; set; } + return new List + { + new InputData { + Age = 26, + Salary = 40000, + }, + new InputData { + Age = 35, + Salary = 80000, + }, + new InputData { + Age = 34, + Salary = 10000, + }, + new InputData { + Age = 28, + Salary = 100000, + }, + }; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs new file mode 100644 index 0000000000..d93cdce46f --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs @@ -0,0 +1,119 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class CustomMappingSaveAndLoad + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. + var rawData = GetData(); + + // Printing the input data. 
+ Console.WriteLine("Age\t Salary"); + foreach (var row in rawData) + Console.WriteLine($"{row.Age}\t {row.Salary}"); + // Expected output: + // Age Salary + // 26 40000 + // 35 80000 + // 34 10000 + // 28 100000 + + var data = mlContext.Data.LoadFromEnumerable(rawData); + + // Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly. + var estimator = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty"); + var transform = estimator.Fit(data); + + // To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be registered in the + // environment. The following registers the assembly where IsUnderThirtyCustomAction is defined. + mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly); + + // Now the transform pipeline can be saved and loaded through the usual MLCOntext method. + mlContext.Model.Save(transform, data.Schema, "customTransform.zip"); + var loadedTransform = mlContext.Model.Load("customTransform.zip", out var inputSchema); + + // Transform the data using the CustomMapping transform that was saved and loaded. + var transformedData = loadedTransform.Transform(data); + + // Printing the output data. + var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); + Console.WriteLine("Age\t Salary\t IsUnderThirty"); + foreach (var row in dataEnumerable) + Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}"); + // Expected output: + // Age Salary IsUnderThirty + // 26 40000 True + // 35 80000 False + // 34 10000 False + // 28 100000 True + } + + // The custom action needs to implement the abstract class CustomMappingFactory, and needs to have attribute + // CustomMappingFactoryAttribute with argument equal to the contractName used to define the CustomMapping estimator + // which uses the action. 
+ [CustomMappingFactoryAttribute("IsUnderThirty")] + public class IsUnderThirtyCustomAction : CustomMappingFactory + { + // We define the custom mapping between input and output rows that will be applied by the transformation. + public static void CustomAction(InputData input, CustomMappingOutput output) + => output.IsUnderThirty = input.Age < 30; + + public override Action GetMapping() + => CustomAction; + } + + // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. + public class CustomMappingOutput + { + public bool IsUnderThirty { get; set; } + } + + // Defines the schema of the input data. + public class InputData + { + public float Age { get; set; } + public float Salary { get; set; } + } + + // Defines the schema of the transformed data, which includes the new column IsUnderThirty. + public class TransformedData + { + public float Age { get; set; } + public float Salary { get; set; } + public bool IsUnderThirty { get; set; } + + } + + // Returns an enumerable of input rows. + public static IEnumerable GetData() + { + return new List + { + new InputData { + Age = 26, + Salary = 40000, + }, + new InputData { + Age = 35, + Salary = 80000, + }, + new InputData { + Age = 34, + Salary = 10000, + }, + new InputData { + Age = 28, + Salary = 100000, + }, + }; + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index 15d448deee..796aa995c1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -7,7 +7,6 @@ namespace Microsoft.ML.Samples.Dynamic { public static class IndicateMissingValues { - public static void Example() { // Create a new ML context, for ML.NET operations. 
It can be used for exception tracking and logging, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs new file mode 100644 index 0000000000..4a0011f5fd --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs @@ -0,0 +1,83 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class IndicateMissingValuesMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + var samples = new List() + { + new DataPoint(){ Label = 3, Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Label = 32, Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {float.NaN, 1} }, + new DataPoint(){ Label = float.NaN, Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {1, float.PositiveInfinity} }, + }; + // Convert training data to IDataView, the general data type used in ML.NET. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // IndicateMissingValues is used to create a boolean containing + // 'true' where the value in the input column is NaN. This value can be used + // to replace missing values with other values. We can use an array of InputOutputColumnPair + // to apply the MissingValueIndicatorEstimator to multiple columns in one pass over the data. 
+ IEstimator pipeline = mlContext.Transforms.IndicateMissingValues(new[] { + new InputOutputColumnPair("MissingIndicator1", "Features1"), + new InputOutputColumnPair("MissingIndicator2", "Features2") + }); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var tansformer = pipeline.Fit(data); + var transformedData = tansformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); + + // a small printing utility + Func vectorPrinter = (object[] vector) => + { + string preview = "["; + foreach (var slot in vector) + preview += $"{slot} "; + return preview += "]"; + + }; + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in rowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())} " + + $"Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + + $"MissingIndicator1: {vectorPrinter(row.MissingIndicator1.Cast().ToArray())} " + + $"MissingIndicator2: {vectorPrinter(row.MissingIndicator2.Cast().ToArray())}"); + } + + // Expected output: + // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingIndicator1: [False False False] MissingIndicator2: [False False] + // Label: 32 Features1: [0 NaN 1] Features2: [NaN 1] MissingIndicator1: [False True False] MissingIndicator2: [True False] + // Label: NaN Features1: [-1 NaN -3 ] Features2: [1 ∞ ] MissingIndicator1: [False True False] MissingIndicator2: [False False] + } + + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features1 { get; set; } + [VectorType(2)] + public float[] Features2 { get; set; } + } + + private sealed class 
SampleDataTransformed : DataPoint + { + public bool[] MissingIndicator1 { get; set; } + public bool[] MissingIndicator2 { get; set; } + + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs index 01fce1ad06..dbe433636f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -18,7 +18,7 @@ public static void Example() new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} }, - new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, + new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, }; // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs new file mode 100644 index 0000000000..c851254f43 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs @@ -0,0 +1,119 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms; + +namespace Microsoft.ML.Samples.Dynamic +{ + class ReplaceMissingValuesMultiColumn + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. 
+ var mlContext = new MLContext(); + var samples = new List() + { + new DataPoint(){ Label = 3, Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Label = 32, Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {0, 1} }, + new DataPoint(){ Label = 5, Features1 = new float[3] {-1, 2, -3}, Features2 = new float[2] {-1, float.NaN} }, + new DataPoint(){ Label = 9, Features1 = new float[3] {-1, 6, -3}, Features2 = new float[2] {0, float.PositiveInfinity} }, + }; + // Convert training data to IDataView, the general data type used in ML.NET. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. + var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + new InputOutputColumnPair("MissingReplaced1", "Features1"), + new InputOutputColumnPair("MissingReplaced2", "Features2") + }, + MissingValueReplacingEstimator.ReplacementMode.Mean); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); + + // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. 
+ var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + new InputOutputColumnPair("MissingReplaced1", "Features1"), + new InputOutputColumnPair("MissingReplaced2", "Features2") + }, + MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var defaultTransformer = defaultPipeline.Fit(data); + var defaultTransformedData = defaultTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); + + // a small printing utility + Func vectorPrinter = (object[] vector) => + { + string preview = "["; + foreach (var slot in vector) + preview += $"{slot} "; + return preview += "]"; + + }; + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
+ foreach (var row in meanRowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())}" + + $" Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + + $" MissingReplaced1: {vectorPrinter(row.MissingReplaced1.Cast().ToArray())}" + + $" MissingReplaced2: {vectorPrinter(row.MissingReplaced2.Cast().ToArray())}"); + } + + // Expected output: + // Notice how the NaN of the Features column for the second row is replaced by the mean of (1, 2, 6) the values in that row + // + // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingReplaced1: [1 1 0] MissingReplaced2: [1 1] + // Label: 32 Features1: [0 NaN 1] Features2: [0 1] MissingReplaced1: [0 3 1] MissingReplaced2: [0 1] + // Label: 5 Features1: [-1 2 - 3] Features2: [-1 NaN] MissingReplaced1: [-1 2 - 3] MissingReplaced2: [-1 1] + // Label: 9 Features1: [-1 6 - 3] Features2: [0 ∞ ] MissingReplaced1: [-1 6 - 3] MissingReplaced2: [0 ∞ ] + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in defaultRowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())}" + + $" Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + + $" MissingReplaced1: {vectorPrinter(row.MissingReplaced1.Cast().ToArray())}" + + $"MissingReplaced2: {vectorPrinter(row.MissingReplaced2.Cast().ToArray())}"); + } + + // Expected output: + // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats. 
+ // + // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingReplaced1: [1 1 0]MissingReplaced2: [1 1] + // Label: 32 Features1: [0 NaN 1] Features2: [0 1] MissingReplaced1: [0 0 1]MissingReplaced2: [0 1] + // Label: 5 Features1: [-1 2 - 3] Features2: [-1 NaN] MissingReplaced1: [-1 2 - 3]MissingReplaced2: [-1 0] + // Label: 9 Features1: [-1 6 - 3] Features2: [0 ∞ ] MissingReplaced1: [-1 6 - 3]MissingReplaced2: [0 ∞ ] + } + + private class DataPoint + { + public float Label { get; set; } + + [VectorType(3)] + public float[] Features1 { get; set; } + [VectorType(2)] + public float[] Features2 { get; set; } + } + + private sealed class SampleDataTransformed : DataPoint + { + [VectorType(3)] + public float[] MissingReplaced1 { get; set; } + [VectorType(2)] + public float[] MissingReplaced2 { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs b/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs index b8d282a5a3..cb6978ddfa 100644 --- a/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs +++ b/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs @@ -1020,11 +1020,12 @@ private void LoadExtensions(Assembly assembly, bool throwOnError) /// Gets a value indicating whether can contain extensions. /// /// - /// All ML.NET product assemblies won't contain extensions. + /// All ML.NET product assemblies won't contain extensions besides Microsoft.ML.Samples. 
/// private static bool CanContainExtensions(Assembly assembly) { if (assembly.FullName.StartsWith("Microsoft.ML.", StringComparison.Ordinal) + && !assembly.FullName.StartsWith("Microsoft.ML.Samples", StringComparison.Ordinal) && HasMLNetPublicKey(assembly)) { return false; diff --git a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs index a3f9320e30..b2ce59c0e6 100644 --- a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs +++ b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs @@ -27,6 +27,7 @@ public static class CustomMappingCatalog /// /// /// public static CustomMappingEstimator CustomMapping(this TransformsCatalog catalog, Action mapAction, string contractName, diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index 30685d8067..a2e0c2c57c 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -38,6 +38,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// /// The transform extensions' catalog. /// Specifies the names of the columns on which to apply the transformation. + /// + /// + /// + /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns) { var env = CatalogUtils.GetEnvironment(catalog); @@ -83,6 +89,12 @@ public static MissingValueReplacingEstimator ReplaceMissingValues(this Transform /// If true, per-slot imputation of replacement is performed. /// Otherwise, replacement value is imputed for the entire vector column. This setting is ignored for scalars and variable vectors, /// where imputation is always for the entire column. 
+ /// + /// + /// + /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns, MissingValueReplacingEstimator.ReplacementMode replacementMode = MissingValueReplacingEstimator.Defaults.Mode, From 0bf25ee329f07a179c4e5d8e0c42003eccd0b9e6 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 8 Apr 2019 15:15:47 -0700 Subject: [PATCH 2/7] review comments --- docs/samples/Directory.Build.props | 6 ++ .../Dynamic/Transforms/CustomMappingSample.cs | 79 +++++----------- ...AndLoad.cs => CustomMappingSaveAndLoad.cs} | 83 ++++++----------- .../Transforms/IndicateMissingValues.cs | 41 +++------ .../IndicateMissingValuesMultiColumn.cs | 45 +++------ .../Transforms/ReplaceMissingValues.cs | 80 +++++++--------- .../ReplaceMissingValuesMultiColumn.cs | 92 +++++++------------ .../ComponentModel/ComponentCatalog.cs | 3 +- .../CustomMappingCatalog.cs | 4 +- .../ExtensionsCatalog.cs | 8 +- 10 files changed, 157 insertions(+), 284 deletions(-) create mode 100644 docs/samples/Directory.Build.props rename docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/{CustomMappingSampleSaveAndLoad.cs => CustomMappingSaveAndLoad.cs} (56%) diff --git a/docs/samples/Directory.Build.props b/docs/samples/Directory.Build.props new file mode 100644 index 0000000000..2d9fe63b48 --- /dev/null +++ b/docs/samples/Directory.Build.props @@ -0,0 +1,6 @@ + + + + $(ToolsDir)Samples.snk + + \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs index b02c31bed2..ffec87ffad 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs @@ -1,8 +1,8 @@ using System; using System.Collections.Generic; -using Microsoft.ML.Transforms; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic 
+namespace Samples.Dynamic { public static class CustomMapping { @@ -13,20 +13,14 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and convert it to an IDataView. - var rawData = GetData(); - - // Printing the input data. - Console.WriteLine("Age\t Salary"); - foreach (var row in rawData) - Console.WriteLine($"{row.Age}\t {row.Salary}"); - // Expected output: - // Age Salary - // 26 40000 - // 35 80000 - // 34 10000 - // 28 100000 - - var data = mlContext.Data.LoadFromEnumerable(rawData); + var samples = new List + { + new InputData { Age = 26 }, + new InputData { Age = 35 }, + new InputData { Age = 34 }, + new InputData { Age = 28 }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); // We define the custom mapping between input and output rows that will be applied by the transformation. Action mapping = @@ -35,66 +29,39 @@ public static void Example() // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, cannot be saved and loaded back. // See other sample on how to load and save the CustomMapping estimator. - var estimator = mlContext.Transforms.CustomMapping(mapping, contractName: null); - var transformedData = estimator.Fit(data).Transform(data); + var pipeline = mlContext.Transforms.CustomMapping(mapping, contractName: null); + var transformedData = pipeline.Fit(data).Transform(data); // Printing the output data. 
var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); - Console.WriteLine("Age\t Salary\t IsUnderThirty"); + Console.WriteLine("Age\t IsUnderThirty"); foreach (var row in dataEnumerable) - Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}"); + Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}"); + // Expected output: - // Age Salary IsUnderThirty - // 26 40000 True - // 35 80000 False - // 34 10000 False - // 28 100000 True + // Age IsUnderThirty + // 26 True + // 35 False + // 34 False + // 28 True } // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. - public class CustomMappingOutput + private class CustomMappingOutput { public bool IsUnderThirty { get; set; } } // Defines the schema of the input data. - public class InputData + private class InputData { public float Age { get; set; } - public float Salary { get; set; } } // Defines the schema of the transformed data, which includes the new column IsUnderThirty. - public class TransformedData + private class TransformedData : InputData { - public float Age { get; set; } - public float Salary { get; set; } public bool IsUnderThirty { get; set; } - - } - - // Returns an enumerable of input rows. 
- public static IEnumerable GetData() - { - return new List - { - new InputData { - Age = 26, - Salary = 40000, - }, - new InputData { - Age = 35, - Salary = 80000, - }, - new InputData { - Age = 34, - Salary = 10000, - }, - new InputData { - Age = 28, - Salary = 100000, - }, - }; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs similarity index 56% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs index d93cdce46f..b007154e75 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSampleSaveAndLoad.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML; using Microsoft.ML.Transforms; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class CustomMappingSaveAndLoad { @@ -13,30 +14,24 @@ public static void Example() var mlContext = new MLContext(); // Get a small dataset as an IEnumerable and convert it to an IDataView. - var rawData = GetData(); - - // Printing the input data. - Console.WriteLine("Age\t Salary"); - foreach (var row in rawData) - Console.WriteLine($"{row.Age}\t {row.Salary}"); - // Expected output: - // Age Salary - // 26 40000 - // 35 80000 - // 34 10000 - // 28 100000 - - var data = mlContext.Data.LoadFromEnumerable(rawData); + var samples = new List + { + new InputData { Age = 26 }, + new InputData { Age = 35 }, + new InputData { Age = 34 }, + new InputData { Age = 28 }, + }; + var data = mlContext.Data.LoadFromEnumerable(samples); - // Custom transformations can be used to transform data directly, or as part of a pipeline. Below we transform data directly. 
- var estimator = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty"); - var transform = estimator.Fit(data); + // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. + var pipeline = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty"); + var transform = pipeline.Fit(data); // To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be registered in the // environment. The following registers the assembly where IsUnderThirtyCustomAction is defined. mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly); - // Now the transform pipeline can be saved and loaded through the usual MLCOntext method. + // Now the transform pipeline can be saved and loaded through the usual MLContext method. mlContext.Model.Save(transform, data.Schema, "customTransform.zip"); var loadedTransform = mlContext.Model.Load("customTransform.zip", out var inputSchema); @@ -45,22 +40,23 @@ public static void Example() // Printing the output data. var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); - Console.WriteLine("Age\t Salary\t IsUnderThirty"); + Console.WriteLine("Age\tIsUnderThirty"); foreach (var row in dataEnumerable) - Console.WriteLine($"{row.Age}\t {row.Salary}\t {row.IsUnderThirty}"); + Console.WriteLine($"{row.Age}\t {row.IsUnderThirty}"); + // Expected output: - // Age Salary IsUnderThirty - // 26 40000 True - // 35 80000 False - // 34 10000 False - // 28 100000 True + // Age IsUnderThirty + // 26 True + // 35 False + // 34 False + // 28 True } // The custom action needs to implement the abstract class CustomMappingFactory, and needs to have attribute // CustomMappingFactoryAttribute with argument equal to the contractName used to define the CustomMapping estimator // which uses the action. 
[CustomMappingFactoryAttribute("IsUnderThirty")] - public class IsUnderThirtyCustomAction : CustomMappingFactory + private class IsUnderThirtyCustomAction : CustomMappingFactory { // We define the custom mapping between input and output rows that will be applied by the transformation. public static void CustomAction(InputData input, CustomMappingOutput output) @@ -71,49 +67,22 @@ public override Action GetMapping() } // Defines only the column to be generated by the custom mapping transformation in addition to the columns already present. - public class CustomMappingOutput + private class CustomMappingOutput { public bool IsUnderThirty { get; set; } } // Defines the schema of the input data. - public class InputData + private class InputData { public float Age { get; set; } - public float Salary { get; set; } } // Defines the schema of the transformed data, which includes the new column IsUnderThirty. - public class TransformedData + private class TransformedData : InputData { - public float Age { get; set; } - public float Salary { get; set; } public bool IsUnderThirty { get; set; } } - - // Returns an enumerable of input rows. 
- public static IEnumerable GetData() - { - return new List - { - new InputData { - Age = 26, - Salary = 40000, - }, - new InputData { - Age = 35, - Salary = 80000, - }, - new InputData { - Age = 34, - Salary = 10000, - }, - new InputData { - Age = 28, - Salary = 100000, - }, - }; - } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index 796aa995c1..45568e99b6 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -1,9 +1,10 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class IndicateMissingValues { @@ -13,19 +14,19 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); + // Get a small dataset as an IEnumerable and convert it to an IDataView. var samples = new List() { - new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, - new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, - new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} }, + new DataPoint(){ Features = new float[3] {1, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Features = new float[3] {-1, float.NaN, -3} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); - // IndicateMissingValues is used to create a boolean containing - // 'true' where the value in the input column is NaN. This value can be used - // to replace missing values with other values. 
- IEstimator pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); + // IndicateMissingValues is used to create a boolean containing 'true' where the value in the + // input column is missing. For floats, missing values are float.NaN. + // The newly created value can be used to replace missing values with other values. + var pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. @@ -35,32 +36,18 @@ public static void Example() // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; - // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
foreach (var row in rowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast().ToArray())}"); - } + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingIndicator: [{string.Join(", ", row.MissingIndicator)}]"); // Expected output: - // - // Label: 3 Features: [1 1 0] MissingIndicator: [False False False] - // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False] - // Label: NaN Features: [-1 NaN -3 ] MissingIndicator: [False True False] + // Features: [1, 1, 0] MissingIndicator: [False, False, False] + // Features: [0, NaN, 1] MissingIndicator: [False, True, False] + // Features: [-1, NaN, -3] MissingIndicator: [False, True, False] } private class DataPoint { - public float Label { get; set; } [VectorType(3)] public float[] Features { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs index 4a0011f5fd..1cf1822124 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs @@ -1,9 +1,10 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; using Microsoft.ML.Data; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class IndicateMissingValuesMultiColumn { @@ -13,20 +14,19 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); + // Get a small dataset as an IEnumerable and convert it to an IDataView. 
var samples = new List() { - new DataPoint(){ Label = 3, Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, - new DataPoint(){ Label = 32, Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {float.NaN, 1} }, - new DataPoint(){ Label = float.NaN, Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {1, float.PositiveInfinity} }, + new DataPoint(){ Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {float.NaN, 1} }, + new DataPoint(){ Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {1, float.PositiveInfinity} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); - // IndicateMissingValues is used to create a boolean containing - // 'true' where the value in the input column is NaN. This value can be used - // to replace missing values with other values. We can use an array of InputOutputColumnPair - // to apply the MissingValueIndicatorEstimator to multiple columns in one pass over the data. - IEstimator pipeline = mlContext.Transforms.IndicateMissingValues(new[] { + // IndicateMissingValues is used to create a boolean containing 'true' where the value in the + // input column is missing. For floats, missing values are float.NaN. + // We can use an array of InputOutputColumnPair to apply the MissingValueIndicatorEstimator to multiple columns in one pass over the data. + var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("MissingIndicator1", "Features1"), new InputOutputColumnPair("MissingIndicator2", "Features2") }); @@ -39,34 +39,19 @@ public static void Example() // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. 
var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; - // And finally, we can write out the rows of the dataset, looking at the columns of interest. foreach (var row in rowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())} " + - $"Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + - $"MissingIndicator1: {vectorPrinter(row.MissingIndicator1.Cast().ToArray())} " + - $"MissingIndicator2: {vectorPrinter(row.MissingIndicator2.Cast().ToArray())}"); - } + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingIndicator1: [{string.Join(", ", row.MissingIndicator1)}]\t " + + $"Features2: [{string.Join(", ", row.Features2)}]\t MissingIndicator2: [{string.Join(", ", row.MissingIndicator2)}]"); // Expected output: - // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingIndicator1: [False False False] MissingIndicator2: [False False] - // Label: 32 Features1: [0 NaN 1] Features2: [NaN 1] MissingIndicator1: [False True False] MissingIndicator2: [True False] - // Label: NaN Features1: [-1 NaN -3 ] Features2: [1 ∞ ] MissingIndicator1: [False True False] MissingIndicator2: [False False] + // Features1: [1, 1, 0] MissingIndicator1: [False, False, False] Features2: [1, 1] MissingIndicator2: [False, False] + // Features1: [0, NaN, 1] MissingIndicator1: [False, True, False] Features2: [NaN, 1] MissingIndicator2: [True, False] + // Features1: [-1, NaN, -3] MissingIndicator1: [False, True, False] Features2: [1, ∞] MissingIndicator2: [False, False] } private class DataPoint { - public float Label { get; set; } [VectorType(3)] public float[] Features1 { get; set; } [VectorType(2)] diff --git 
a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs index dbe433636f..8143a9679e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -1,10 +1,11 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { class ReplaceMissingValues { @@ -13,81 +14,62 @@ public static void Example() // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. var samples = new List() { - new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, - new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, - new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} }, - new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, + new DataPoint(){ Features = new float[3] {float.PositiveInfinity, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Features = new float[3] {-1, 2, -3} }, + new DataPoint(){ Features = new float[3] {-1, float.NaN, -3} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); + // Default ReplaceMode: // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. 
- var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.Mean); + var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.DefaultValue); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. - var meanTransformer = meanPipeline.Fit(data); - var meanTransformedData = meanTransformer.Transform(data); + var defaultTransformedData = defaultPipeline.Fit(data).Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. - var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); + var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in defaultRowEnumerable) + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingReplaced: [{string.Join(", ", row.MissingReplaced)}]"); + + // Expected output: + // Features: [∞, 1, 0] MissingReplaced: [∞, 1, 0] + // Features: [0, NaN, 1] MissingReplaced: [0, 0, 1] + // Features: [-1, 2, -3] MissingReplaced: [-1, 2, -3] + // Features: [-1, NaN, -3] MissingReplaced: [-1, 0, -3] + + // Mean ReplaceMode: // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. 
- var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.Mean); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. - var defaultTransformer = defaultPipeline.Fit(data); - var defaultTransformedData = defaultTransformer.Transform(data); + var meanTransformedData = meanPipeline.Fit(data).Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. - var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); - - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); // And finally, we can write out the rows of the dataset, looking at the columns of interest. 
foreach (var row in meanRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); - } + Console.WriteLine($"Features: [{string.Join(", ", row.Features)}]\t MissingReplaced: [{string.Join(", ", row.MissingReplaced)}]"); // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by the mean of (1, 2, 6) the values in that row - // - //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] - //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1] - //Label: 5 Features: [-1 2 - 3] MissingReplaced: [-1 2 -3] - //Label: 9 Features: [-1 6 - 3] MissingReplaced: [-1 6 -3] - - // And finally, we can write out the rows of the dataset, looking at the columns of interest. - foreach (var row in defaultRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); - } - - // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats. 
- // - //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] - //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1] - //Label: 5 Features: [-1 2 - 3] MissingReplaced: [-1 2 - 3] - //Label: 9 Features: [-1 6 - 3] MissingReplaced: [-1 6 - 3] + // Features: [∞, 1, 0] MissingReplaced: [∞, 1, 0] + // Features: [0, NaN, 1] MissingReplaced: [0, 1.5, 1] + // Features: [-1, 2, -3] MissingReplaced: [-1, 2, -3] + // Features: [-1, NaN, -3] MissingReplaced: [-1, 1.5, -3] } private class DataPoint { - public float Label { get; set; } - [VectorType(3)] public float[] Features { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs index c851254f43..ed73547cba 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs @@ -1,10 +1,11 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { class ReplaceMissingValuesMultiColumn { @@ -13,95 +14,72 @@ public static void Example() // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. var mlContext = new MLContext(); + + // Get a small dataset as an IEnumerable and convert it to an IDataView. 
var samples = new List() { - new DataPoint(){ Label = 3, Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, - new DataPoint(){ Label = 32, Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {0, 1} }, - new DataPoint(){ Label = 5, Features1 = new float[3] {-1, 2, -3}, Features2 = new float[2] {-1, float.NaN} }, - new DataPoint(){ Label = 9, Features1 = new float[3] {-1, 6, -3}, Features2 = new float[2] {0, float.PositiveInfinity} }, + new DataPoint(){ Features1 = new float[3] {1, 1, 0}, Features2 = new float[2] {1, 1} }, + new DataPoint(){ Features1 = new float[3] {0, float.NaN, 1}, Features2 = new float[2] {0, 1} }, + new DataPoint(){ Features1 = new float[3] {-1, float.NaN, -3}, Features2 = new float[2] {-1, float.NaN} }, + new DataPoint(){ Features1 = new float[3] {-1, 6, -3}, Features2 = new float[2] {0, float.PositiveInfinity} }, }; - // Convert training data to IDataView, the general data type used in ML.NET. var data = mlContext.Data.LoadFromEnumerable(samples); + // Default ReplaceMode: // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { new InputOutputColumnPair("MissingReplaced1", "Features1"), new InputOutputColumnPair("MissingReplaced2", "Features2") }, - MissingValueReplacingEstimator.ReplacementMode.Mean); + MissingValueReplacingEstimator.ReplacementMode.DefaultValue); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. 
- var meanTransformer = meanPipeline.Fit(data); - var meanTransformedData = meanTransformer.Transform(data); + var defaultTransformedData = defaultPipeline.Fit(data).Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. - var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); + var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in defaultRowEnumerable) + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingReplaced1: [{string.Join(", ", row.MissingReplaced1)}]\t " + + $"Features2: [{ string.Join(", ", row.Features2)}]\t MissingReplaced2: [{string.Join(", ", row.MissingReplaced2)}]"); + + // Expected output: + // Features1: [1, 1, 0] MissingReplaced1: [1, 1, 0] Features2: [1, 1] MissingReplaced2: [1, 1] + // Features1: [0, NaN, 1] MissingReplaced1: [0, 0, 1] Features2: [0, 1] MissingReplaced2: [0, 1] + // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 0, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 0] + // Features1: [-1, 6, -3] MissingReplaced1: [-1, 6, -3] Features2: [0, ∞] MissingReplaced2: [0, ∞] + + // Mean ReplaceMode: // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { + var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { new InputOutputColumnPair("MissingReplaced1", "Features1"), new InputOutputColumnPair("MissingReplaced2", "Features2") }, - MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + MissingValueReplacingEstimator.ReplacementMode.Mean); // Now we can transform the data and look at the output to confirm the behavior of the estimator. 
// This operation doesn't actually evaluate data until we read the data below. - var defaultTransformer = defaultPipeline.Fit(data); - var defaultTransformedData = defaultTransformer.Transform(data); + var meanTransformedData = meanPipeline.Fit(data).Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. - var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); - - // a small printing utility - Func vectorPrinter = (object[] vector) => - { - string preview = "["; - foreach (var slot in vector) - preview += $"{slot} "; - return preview += "]"; - - }; + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); // And finally, we can write out the rows of the dataset, looking at the columns of interest. foreach (var row in meanRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())}" + - $" Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + - $" MissingReplaced1: {vectorPrinter(row.MissingReplaced1.Cast().ToArray())}" + - $" MissingReplaced2: {vectorPrinter(row.MissingReplaced2.Cast().ToArray())}"); - } + Console.WriteLine($"Features1: [{string.Join(", ", row.Features1)}]\t MissingReplaced1: [{string.Join(", ", row.MissingReplaced1)}]\t " + + $"Features2: [{ string.Join(", ", row.Features2)}]\t MissingReplaced2: [{string.Join(", ", row.MissingReplaced2)}]"); // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by the mean of (1, 2, 6) the values in that row - // - // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingReplaced1: [1 1 0] MissingReplaced2: [1 1] - // Label: 32 Features1: [0 NaN 1] Features2: [0 1] MissingReplaced1: [0 3 1] MissingReplaced2: [0 1] - // Label: 5 Features1: [-1 2 - 3] Features2: [-1 NaN] MissingReplaced1: [-1 2 - 3] MissingReplaced2: [-1 1] - // Label: 9 
Features1: [-1 6 - 3] Features2: [0 ∞ ] MissingReplaced1: [-1 6 - 3] MissingReplaced2: [0 ∞ ] - - // And finally, we can write out the rows of the dataset, looking at the columns of interest. - foreach (var row in defaultRowEnumerable) - { - Console.WriteLine($"Label: {row.Label} Features1: {vectorPrinter(row.Features1.Cast().ToArray())}" + - $" Features2: {vectorPrinter(row.Features2.Cast().ToArray())} " + - $" MissingReplaced1: {vectorPrinter(row.MissingReplaced1.Cast().ToArray())}" + - $"MissingReplaced2: {vectorPrinter(row.MissingReplaced2.Cast().ToArray())}"); - } - - // Expected output: - // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats. - // - // Label: 3 Features1: [1 1 0] Features2: [1 1] MissingReplaced1: [1 1 0]MissingReplaced2: [1 1] - // Label: 32 Features1: [0 NaN 1] Features2: [0 1] MissingReplaced1: [0 0 1]MissingReplaced2: [0 1] - // Label: 5 Features1: [-1 2 - 3] Features2: [-1 NaN] MissingReplaced1: [-1 2 - 3]MissingReplaced2: [-1 0] - // Label: 9 Features1: [-1 6 - 3] Features2: [0 ∞ ] MissingReplaced1: [-1 6 - 3]MissingReplaced2: [0 ∞ ] + // Features1: [1, 1, 0] MissingReplaced1: [1, 1, 0] Features2: [1, 1] MissingReplaced2: [1, 1] + // Features1: [0, NaN, 1] MissingReplaced1: [0, 3.5, 1] Features2: [0, 1] MissingReplaced2: [0, 1] + // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 3.5, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 1] + // Features1: [-1, 6, -3] MissingReplaced1: [-1, 6, -3] Features2: [0, ∞] MissingReplaced2: [0, ∞] } private class DataPoint { - public float Label { get; set; } - [VectorType(3)] public float[] Features1 { get; set; } [VectorType(2)] diff --git a/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs b/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs index cb6978ddfa..b8d282a5a3 100644 --- a/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs +++ b/src/Microsoft.ML.Core/ComponentModel/ComponentCatalog.cs @@ -1020,12 +1020,11 @@ 
private void LoadExtensions(Assembly assembly, bool throwOnError) /// Gets a value indicating whether can contain extensions. /// /// - /// All ML.NET product assemblies won't contain extensions besides Microsoft.ML.Samples. + /// All ML.NET product assemblies won't contain extensions. /// private static bool CanContainExtensions(Assembly assembly) { if (assembly.FullName.StartsWith("Microsoft.ML.", StringComparison.Ordinal) - && !assembly.FullName.StartsWith("Microsoft.ML.Samples", StringComparison.Ordinal) && HasMLNetPublicKey(assembly)) { return false; diff --git a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs index b2ce59c0e6..0bab219ee4 100644 --- a/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs +++ b/src/Microsoft.ML.Transforms/CustomMappingCatalog.cs @@ -26,8 +26,8 @@ public static class CustomMappingCatalog /// /// /// /// public static CustomMappingEstimator CustomMapping(this TransformsCatalog catalog, Action mapAction, string contractName, diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index a2e0c2c57c..5ff3f5a47c 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -24,7 +24,7 @@ public static class ExtensionsCatalog /// /// /// /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, @@ -41,7 +41,7 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// /// /// /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, InputOutputColumnPair[] columns) @@ -69,7 +69,7 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// /// /// /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, @@ -92,7 +92,7 @@ public static MissingValueReplacingEstimator 
ReplaceMissingValues(this Transform /// /// /// /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, From 9473b46a795ec23b2742ce69934cec16f729112b Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 8 Apr 2019 15:16:37 -0700 Subject: [PATCH 3/7] remove sample from file name --- .../Transforms/{CustomMappingSample.cs => CustomMapping.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/{CustomMappingSample.cs => CustomMapping.cs} (100%) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs similarity index 100% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSample.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs From ce038f94ea421fd6d88487de4164168174096992 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 8 Apr 2019 16:02:38 -0700 Subject: [PATCH 4/7] update to test.snk --- docs/samples/Directory.Build.props | 2 +- docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/samples/Directory.Build.props b/docs/samples/Directory.Build.props index 2d9fe63b48..166f71b20c 100644 --- a/docs/samples/Directory.Build.props +++ b/docs/samples/Directory.Build.props @@ -1,6 +1,6 @@ - $(ToolsDir)Samples.snk + $(ToolsDir)Test.snk \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 01106e7ee5..a990731098 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -3,6 +3,8 @@ netcoreapp2.1 Exe + Samples.snk + true From 779f5f6f5d34e69cfb8cc4f7d151c6c34a7b0072 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni 
Date: Mon, 8 Apr 2019 16:59:55 -0700 Subject: [PATCH 5/7] making a few corrections and updating proj csproj file --- docs/samples/Directory.Build.props | 6 ------ .../Dynamic/Transforms/CustomMapping.cs | 7 +++++-- .../Dynamic/Transforms/CustomMappingSaveAndLoad.cs | 8 ++++---- .../Dynamic/Transforms/IndicateMissingValues.cs | 1 - .../Transforms/IndicateMissingValuesMultiColumn.cs | 1 - .../Dynamic/Transforms/ReplaceMissingValues.cs | 7 ++++--- .../Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs | 7 ++++--- .../Microsoft.ML.Samples/Microsoft.ML.Samples.csproj | 4 ++-- 8 files changed, 19 insertions(+), 22 deletions(-) delete mode 100644 docs/samples/Directory.Build.props diff --git a/docs/samples/Directory.Build.props b/docs/samples/Directory.Build.props deleted file mode 100644 index 166f71b20c..0000000000 --- a/docs/samples/Directory.Build.props +++ /dev/null @@ -1,6 +0,0 @@ - - - - $(ToolsDir)Test.snk - - \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs index ffec87ffad..bd6844de91 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs @@ -30,9 +30,12 @@ public static void Example() // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, cannot be saved and loaded back. // See other sample on how to load and save the CustomMapping estimator. var pipeline = mlContext.Transforms.CustomMapping(mapping, contractName: null); - var transformedData = pipeline.Fit(data).Transform(data); - // Printing the output data. + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. 
+ var transformer = pipeline.Fit(data); + var transformedData = transformer.Transform(data); + var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); Console.WriteLine("Age\t IsUnderThirty"); foreach (var row in dataEnumerable) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs index b007154e75..69c186c080 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs @@ -25,20 +25,20 @@ public static void Example() // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. var pipeline = mlContext.Transforms.CustomMapping(new IsUnderThirtyCustomAction().GetMapping(), contractName: "IsUnderThirty"); - var transform = pipeline.Fit(data); + var transformer = pipeline.Fit(data); // To save and load the CustomMapping estimator, the assembly in which the custom action is defined needs to be registered in the // environment. The following registers the assembly where IsUnderThirtyCustomAction is defined. mlContext.ComponentCatalog.RegisterAssembly(typeof(IsUnderThirtyCustomAction).Assembly); // Now the transform pipeline can be saved and loaded through the usual MLContext method. - mlContext.Model.Save(transform, data.Schema, "customTransform.zip"); + mlContext.Model.Save(transformer, data.Schema, "customTransform.zip"); var loadedTransform = mlContext.Model.Load("customTransform.zip", out var inputSchema); - // Transform the data using the CustomMapping transform that was saved and loaded. + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. 
var transformedData = loadedTransform.Transform(data); - // Printing the output data. var dataEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: true); Console.WriteLine("Age\tIsUnderThirty"); foreach (var row in dataEnumerable) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index 45568e99b6..c3cec8af1d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs index 1cf1822124..dcb902fe3b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs index 8143a9679e..dd2bc47337 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -31,7 +30,8 @@ public static void Example() // Now we can transform the data and look at the output to confirm the behavior of the estimator. 
// This operation doesn't actually evaluate data until we read the data below. - var defaultTransformedData = defaultPipeline.Fit(data).Transform(data); + var defaultTransformer = defaultPipeline.Fit(data); + var defaultTransformedData = defaultTransformer.Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); @@ -52,7 +52,8 @@ public static void Example() // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. - var meanTransformedData = meanPipeline.Fit(data).Transform(data); + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs index ed73547cba..d5deec8791 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms; @@ -35,7 +34,8 @@ public static void Example() // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. 
- var defaultTransformedData = defaultPipeline.Fit(data).Transform(data); + var defaultTransformer = defaultPipeline.Fit(data); + var defaultTransformedData = defaultTransformer.Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); @@ -61,7 +61,8 @@ public static void Example() // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. - var meanTransformedData = meanPipeline.Fit(data).Transform(data); + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index a990731098..363419b243 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -1,9 +1,9 @@  - + netcoreapp2.1 Exe - Samples.snk + $(ToolsDir)Test.snk true From da90716d4366e86963a0a1c2df3cd7f3248c9ab9 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Tue, 9 Apr 2019 16:00:52 -0700 Subject: [PATCH 6/7] review comments and remove signing altogether --- .../Dynamic/Transforms/CustomMapping.cs | 4 ++-- .../Dynamic/Transforms/IndicateMissingValues.cs | 3 +-- .../Transforms/IndicateMissingValuesMultiColumn.cs | 5 +++-- .../Dynamic/Transforms/ReplaceMissingValues.cs | 12 ++++++------ .../Transforms/ReplaceMissingValuesMultiColumn.cs | 6 ++---- .../Microsoft.ML.Samples/Microsoft.ML.Samples.csproj | 4 ++-- 6 files changed, 16 
insertions(+), 18 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs index bd6844de91..9a0e7aabf3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMapping.cs @@ -27,8 +27,8 @@ public static void Example() (input, output) => output.IsUnderThirty = input.Age < 30; // Custom transformations can be used to transform data directly, or as part of a pipeline of estimators. - // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, cannot be saved and loaded back. - // See other sample on how to load and save the CustomMapping estimator. + // Note: If contractName is null in the CustomMapping estimator, any pipeline of estimators containing it, + // cannot be saved and loaded back. var pipeline = mlContext.Transforms.CustomMapping(mapping, contractName: null); // Now we can transform the data and look at the output to confirm the behavior of the estimator. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index c3cec8af1d..5f49e7a157 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -23,8 +23,7 @@ public static void Example() var data = mlContext.Data.LoadFromEnumerable(samples); // IndicateMissingValues is used to create a boolean containing 'true' where the value in the - // input column is missing. For floats, missing values are float.NaN. - // The newly created value can be used to replace missing values with other values. + // input column is missing. For floats and doubles, missing values are represented as NaN. 
var pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); // Now we can transform the data and look at the output to confirm the behavior of the estimator. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs index dcb902fe3b..830fb9d047 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValuesMultiColumn.cs @@ -23,8 +23,9 @@ public static void Example() var data = mlContext.Data.LoadFromEnumerable(samples); // IndicateMissingValues is used to create a boolean containing 'true' where the value in the - // input column is missing. For floats, missing values are float.NaN. - // We can use an array of InputOutputColumnPair to apply the MissingValueIndicatorEstimator to multiple columns in one pass over the data. + // input column is missing. For floats and doubles, missing values are NaN. + // We can use an array of InputOutputColumnPair to apply the MissingValueIndicatorEstimator + // to multiple columns in one pass over the data. 
var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("MissingIndicator1", "Features1"), new InputOutputColumnPair("MissingIndicator2", "Features2") diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs index dd2bc47337..adcf5c0b75 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -24,9 +24,9 @@ public static void Example() }; var data = mlContext.Data.LoadFromEnumerable(samples); - // Default ReplaceMode: - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.DefaultValue); + // Here we use the default replacement mode, which replaces the value with the default value for its type. + var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", + MissingValueReplacingEstimator.ReplacementMode.DefaultValue); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. @@ -46,9 +46,9 @@ public static void Example() // Features: [-1, 2, -3] MissingReplaced: [-1, 2, -3] // Features: [-1, NaN, -3] MissingReplaced: [-1, 0, -3] - // Mean ReplaceMode: - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. - var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", MissingValueReplacingEstimator.ReplacementMode.Mean); + // Here we use the mean replacement mode, which replaces the value with the mean of the values that were not missing.
+ var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", + MissingValueReplacingEstimator.ReplacementMode.Mean); // Now we can transform the data and look at the output to confirm the behavior of the estimator. // This operation doesn't actually evaluate data until we read the data below. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs index d5deec8791..aa5d1acf5b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValuesMultiColumn.cs @@ -24,8 +24,7 @@ public static void Example() }; var data = mlContext.Data.LoadFromEnumerable(samples); - // Default ReplaceMode: - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. + // Here we use the default replacement mode, which replaces the value with the default value for its type. var defaultPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { new InputOutputColumnPair("MissingReplaced1", "Features1"), new InputOutputColumnPair("MissingReplaced2", "Features2") @@ -51,8 +50,7 @@ public static void Example() // Features1: [-1, NaN, -3] MissingReplaced1: [-1, 0, -3] Features2: [-1, NaN] MissingReplaced2: [-1, 0] // Features1: [-1, 6, -3] MissingReplaced1: [-1, 6, -3] Features2: [0, ∞] MissingReplaced2: [0, ∞] - // Mean ReplaceMode: - // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. + // Here we use the mean replacement mode, which replaces the value with the mean of the values that were not missing.
var meanPipeline = mlContext.Transforms.ReplaceMissingValues(new[] { new InputOutputColumnPair("MissingReplaced1", "Features1"), new InputOutputColumnPair("MissingReplaced2", "Features2") diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 363419b243..22c78a4a4a 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -3,8 +3,8 @@ netcoreapp2.1 Exe - $(ToolsDir)Test.snk - true + false + false From 7e11f86161a921984e2732cdf6e90624ef1698a0 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Wed, 10 Apr 2019 09:55:03 -0700 Subject: [PATCH 7/7] review comments --- .../Dynamic/Transforms/CustomMappingSaveAndLoad.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs index 69c186c080..8bc4439190 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/CustomMappingSaveAndLoad.cs @@ -82,7 +82,6 @@ private class InputData private class TransformedData : InputData { public bool IsUnderThirty { get; set; } - } } }