From 074b8000cee9f6dd3a7cb6b096b5f7d908985eac Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 9 Jul 2018 10:39:15 -0700
Subject: [PATCH 01/14] Moving from xml strings to having the documentation
 details in xml files. For the summary text that is common between several
 learners, the examples will be added on a separate node. An example of how
 that will look like is in the LogisticRegressionBinaryClassifier and
 LogisticRegressionClassifier.

---
 docs/code/xmlIncludes/Learners.xml            | 274 +++++++++++++
 docs/code/xmlIncludes/Pca.xml                 |  27 ++
 docs/code/xmlIncludes/Transforms.xml          |  52 +++
 .../EntryPoints/ModuleArgs.cs                 |   4 +-
 .../EntryPoints/ModuleCatalog.cs              |   4 +-
 src/Microsoft.ML.FastTree/FastTree.cs         |  25 --
 .../FastTreeArguments.cs                      |   1 +
 .../FastTreeClassification.cs                 |   7 +-
 src/Microsoft.ML.FastTree/FastTreeRanking.cs  |   5 +-
 .../FastTreeRegression.cs                     |   5 +-
 src/Microsoft.ML.FastTree/FastTreeTweedie.cs  |  17 +-
 src/Microsoft.ML.FastTree/RandomForest.cs     |  22 --
 .../RandomForestClassification.cs             |   5 +-
 .../RandomForestRegression.cs                 |   5 +-
 .../KMeansPlusPlusTrainer.cs                  |  13 +-
 .../LightGbmBinaryTrainer.cs                  |   5 +-
 .../LightGbmMulticlassTrainer.cs              |   5 +-
 .../LightGbmRankingTrainer.cs                 |   7 +-
 .../LightGbmRegressionTrainer.cs              |   6 +-
 .../LightGbmTrainerBase.cs                    |   3 -
 src/Microsoft.ML.PCA/PcaTrainer.cs            |  24 +-
 src/Microsoft.ML.PCA/PcaTransform.cs          |   7 +-
 .../FactorizationMachineTrainer.cs            |  28 +-
 .../Standard/LinearClassificationTrainer.cs   |  24 +-
 .../LogisticRegression/LbfgsPredictorBase.cs  |  29 --
 .../LogisticRegression/LogisticRegression.cs  |   8 +-
 .../MulticlassLogisticRegression.cs           |   7 +-
 .../Standard/Online/AveragedPerceptron.cs     |  36 +-
 .../Standard/Online/OnlineGradientDescent.cs  |  10 +-
 .../PoissonRegression/PoissonRegression.cs    |  12 +-
 .../Standard/SdcaMultiClass.cs                |  11 +-
 .../Standard/SdcaRegression.cs                |   7 +-
 .../CategoricalHashTransform.cs               |   1 +
 .../CategoricalTransform.cs                   |  22 +-
 src/Microsoft.ML/CSharpApi.cs                 | 374 ++----------------
 .../Internal/Tools/CSharpApiGenerator.cs      |   2 +-
 .../Internal/Tools/CSharpGeneratorUtils.cs    |  15 +-
 37 files changed, 502 insertions(+), 607 deletions(-)
 create mode 100644 docs/code/xmlIncludes/Learners.xml
 create mode 100644 docs/code/xmlIncludes/Pca.xml
 create mode 100644 docs/code/xmlIncludes/Transforms.xml
diff --git a/docs/code/xmlIncludes/Learners.xml b/docs/code/xmlIncludes/Learners.xml
new file mode 100644
index 0000000000..3c3ddd6bf0
--- /dev/null
+++ b/docs/code/xmlIncludes/Learners.xml
@@ -0,0 +1,274 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="FieldAwareFactorizationMachineBinaryClassifier">
+      <summary>
+        Train a field-aware factorization machine for binary classification using ADAGRAD (an advanced stochastic gradient method). 
+      </summary>
+      <remarks>
+        Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
+        The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
+        An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
+        <para> For a general idea of what Field-aware Factorization Machines are see: <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
+        </para>
+        <para>See references below for more details. 
+        This trainer is essentially faster than the one introduced in [2] because of some implementation tricks [3].
+        </para>
+          <list >
+            <item>
+              <description>
+                [1] <a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a></description></item>
+            <item>
+              <description>
+                [2] <a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
+              </description>
+            </item>
+            <item>
+              <description>
+                [3] <a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
+              </description>
+            </item>
+          </list>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier(){ LearningRate = 0.5f, Iter=2 });
+        </code>
+      </example>
+    </member>
+
+    <member name="SDCA">
+      <summary>
+        Train an SDCA linear model.
+      </summary>
+      <remarks>
+        This classifier is a trainer based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions.
+        The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.
+        <para>
+          Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
+          Several choices of loss functions are also provided.
+          The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
+        </para>
+        <para>
+          Note that SDCA is a stochastic and streaming optimization algorithm.
+          The results depend on the order of the training data. For reproducible results, it is recommended that one sets 'Shuffle' to
+          False and 'NumThreads' to 1.
+          Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence.
+          In general, the larger the 'L2Const', the faster SDCA converges.
+        </para>
+        <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
+        <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
+      </remarks>
+    </member>
+
+    <member name="FastTree">
+      <summary>
+        Trains gradient boosted decision trees to the LambdaRank quasi-gradient. 
+      </summary>
+      <remarks>
+        <para>
+          FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
+          Gradient boosting is a machine learning technique for regression problems.
+          It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
+          So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
+        </para>
+        <para>
+          MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves.
+          A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input.
+          At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x &lt;= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature.
+          The functions that can be produced by a regression tree are all the piece-wise constant functions.
+        </para>
+        <para>
+          The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
+          The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
+        </para>
+        <list type='bullet'>
+          <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
+          <item>In case of a regression problem, the output is the predicted value of the function.</item>
+          <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
+        </list>
+        <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
+        <a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
+      </remarks>
+    </member>
+    
+    <member name="FastForest">
+      <summary>
+        Trains a random forest to fit target values using least-squares.
+      </summary>
+      <remarks>
+        Decision trees are non-parametric models that perform a sequence of simple tests on inputs.
+        This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed.
+        A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
+        <para>Decision trees have several advantages:</para>
+        <list type='bullet'>
+          <item><description>They are efficient in both computation and memory usage during training and prediction. </description></item>
+          <item><description>They can represent non-linear decision boundaries.</description></item>
+          <item><description>They perform integrated feature selection and classification. </description></item>
+          <item><description>They are resilient in the presence of noisy features.</description></item>
+        </list>
+        Fast forest is a random forest implementation.
+        The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
+        An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
+        This decision forest classifier consists of an ensemble of decision trees.
+        Generally, ensemble models provide better coverage and accuracy than single decision trees.
+        Each tree in a decision forest outputs a Gaussian distribution.
+        <a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
+        <a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
+        <a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
+      </remarks>
+    </member>
+
+    <member name="LightGBM">
+      <summary>
+        Trains a Light GBM Model.
+      </summary>
+      <remarks>
+        Light GBM is an open source implementation of boosted trees.
+        <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a>
+      </remarks>
+    </member>
+
+    <member name="LBFGS">
+      <summary>
+        Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. 
+        The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.
+      </summary>
+      <remarks>
+        If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
+        <para>
+          The optimization technique used for LogisticRegression Classifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
+          Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
+          But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction,
+          so that it is especially suited for problems with a large number of variables.
+          The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
+        </para>
+        <para>
+          This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
+          Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
+          This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff.
+          Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
+          An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
+        </para>
+          <list type='bullet'>
+            <item>
+              <description>
+              <paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data.
+              It pulls small weights associated with features that are relatively unimportant towards 0.
+            </description>
+            </item>
+            <item>
+              <description>
+                <paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero.
+              </description>
+            </item>
+          </list>
+          Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
+          The default values of x and y are both 1.
+          An aggressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
+        <para>For more information see:</para>
+        <list type='bullet'>
+          <item><description><a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.</description></item>
+          <item><description><a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.</description></item>
+          <item><description><a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.</description></item>
+          <item><description><a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.</description></item>
+        </list>
+      </remarks>
+    </member>
+    <example name='LogisticRegressionClassifier'>
+      <example>
+        <code>
+          pipeline.Add(new LogisticRegressionClassifier());
+        </code>
+      </example>
+    </example>
+    <example name='LogisticRegressionBinaryClassifier'>
+      <example>
+        <code>
+          pipeline.Add(new LogisticRegressionBinaryClassifier());
+        </code>
+      </example>
+    </example>
+
+    <member name="KMeans++">
+      <summary>
+        K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified 
+        number of clusters in order to minimize the within-cluster sum of squares.
+      </summary>
+      <remarks>
+        K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
+        YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).
+        YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
+        It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
+        <para>For more information on K-means, and K-means++ see:</para>
+        <para><a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.</para>
+        <para><a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a></para>
+      </remarks>
+    </member>
+
+    <member name="OGD">
+      <summary>
+        Stochastic gradient descent is an optimization method used to train a wide range of models in machine learning. 
+        In ML.NET, the implementation of OGD is for linear regression.
+      </summary>
+      <remarks>
+        Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
+        The OnlineGradientDescentRegressor implements the standard (non-batch) SGD, with a choice of loss functions,
+        and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
+      </remarks>
+    </member>
+
+    <member name="AP">
+      <summary>
+        Averaged Perceptron Binary Classifier. 
+      </summary>
+      <remarks>
+        Perceptron is a classification algorithm that makes its predictions based on a linear function.
+        I.e., for an instance with feature values f0, f1,..., f_D-1, the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.
+        <para>
+          Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
+          The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed.
+          If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
+          the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
+          multiplied by a factor 0 &lt; a &lt;= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate,
+          and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
+        </para>
+        <para>
+          In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored,
+          together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
+          The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) of the different weight vectors.
+        </para>
+        <para> For more information see:</para>
+        <para><a href='https://en.wikipedia.org/wiki/Perceptron'>Wikipedia entry for Perceptron</a></para>
+        <para><a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a></para>
+      </remarks>
+    </member>
+    
+    <member name="PoissonRegression">
+      <summary>
+        Trains a Poisson Regression model.  
+      </summary>
+      <remarks>
+        <a href='https://en.wikipedia.org/wiki/Poisson_regression'>Poisson regression</a> is a parameterized regression method.
+        It assumes that the log of the conditional mean of the dependent variable follows a linear function of the independent variables.
+        Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations.
+      </remarks>
+    </member>
+
+    <member name="FastTreeTweedieRegression">
+      <summary>
+        Trains gradient boosted decision trees to fit target values using a Tweedie loss function. 
+        This learner is a generalization of Poisson, compound Poisson, and gamma regression.
+      </summary>
+      <remarks>
+        The Tweedie boosting model follows the mathematics established in <a href="https://arxiv.org/pdf/1508.06378.pdf">
+        Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models.</a> from Yang, Quan, and Zou. 
+        For an introduction to Gradient Boosting, and more information, see:
+        <para><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a></para>
+        <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
+      </remarks>
+    </member>
+        
+  </members>
+</docs>
\ No newline at end of file
diff --git a/docs/code/xmlIncludes/Pca.xml b/docs/code/xmlIncludes/Pca.xml
new file mode 100644
index 0000000000..98d423b754
--- /dev/null
+++ b/docs/code/xmlIncludes/Pca.xml
@@ -0,0 +1,27 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="PCA">
+      <summary>
+        PCA is a dimensionality-reduction transform which computes the projection of the feature vector onto a low-rank subspace.
+      </summary>
+      <remarks>
+      <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principal Component Analysis (PCA)</a> is a dimensionality-reduction transform which computes the projection of the feature vector onto a low-rank subspace.
+      Its training is done using the technique described in the paper: <a href='https://arxiv.org/pdf/1310.6304v2.pdf'>Combining Structured and Unstructured Randomness in Large Scale PCA</a>,
+      and the paper <see href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</see>
+      <a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
+      <a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
+      <a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
+      </remarks>
+      <example>
+        An example of how to add the PcaCalculator transform to a pipeline with a column named &quot;Features&quot;.
+        <code>
+          string[] features = new string[] { &quot;Sepal length&quot;, &quot;Sepal width&quot;, &quot;Petal length&quot;, &quot;Petal width&quot; };
+          pipeline.Add(new PcaCalculator(features){ Rank = 3 });
+        </code>
+      </example>
+    </member>
+    
+  </members>
+</docs>
\ No newline at end of file
diff --git a/docs/code/xmlIncludes/Transforms.xml b/docs/code/xmlIncludes/Transforms.xml
new file mode 100644
index 0000000000..7482c3a272
--- /dev/null
+++ b/docs/code/xmlIncludes/Transforms.xml
@@ -0,0 +1,52 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+
+    <member name="CategoricalHashOneHotVectorizer">
+      <summary>
+        Encodes the categorical variable with hash-based encoding. 
+      </summary>
+      <remarks>
+        CategoricalHashOneHotVectorizer converts a categorical value into an indicator array by hashing the
+        value and using the hash as an index in the bag.
+        If the input column is a vector, a single indicator bag is returned for it.
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new CategoricalHashOneHotVectorizer(&quot;Text1&quot;) { HashBits = 10, Seed = 314489979, OutputKind = CategoricalTransformOutputKind.Bag });
+        </code>
+      </example>
+    </member>
+
+    <member name="CategoricalOneHotVectorizer">
+      <summary>
+        Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array
+      </summary>
+      <remarks>
+        <para>The CategoricalOneHotVectorizer transform passes through a data set, operating on text columns, to
+        build a dictionary of categories.
+        For each row, the entire text string appearing in the input column is defined as a category.
+        The output of this transform is an indicator vector.
+        Each slot in this vector corresponds to a category in the dictionary, so its length is the size of the built dictionary.
+        The CategoricalOneHotVectorizer can be applied to one or more columns, in which case it builds and uses a separate dictionary
+        for each column that it is applied to.</para>
+        
+        <para>The <see cref="Microsoft.ML.Runtime.Data.CategoricalTransform.OutputKind"/> produces integer values and <see cref="KeyType"/> columns.
+        The Key value is the one-based index of the slot set in the Ind/Bag options.
+        If the Key option is not found, it is assigned the value zero.
+        If the <see cref="CategoricalTransform.OutputKind.Ind"/> or <see cref="CategoricalTransform.OutputKind.Bag"/> options are not found, they result in an all zero bit vector.
+        <see cref="CategoricalTransform.OutputKind.Ind"/> and <see cref="CategoricalTransform.OutputKind.Bag"/> differ simply in how the bit-vectors generated from individual slots are aggregated:
+        for Ind they are concatenated and for Bag they are added.
+        When the source column is a singleton, the Ind and Bag options are identical.</para>
+      </remarks>
+      <example>
+        An example of how to add the CategoricalOneHotVectorizer transform to a pipeline with two text column 
+        features named &quot;Text1&quot; and &quot;Text2&quot;.
+        <code>
+          pipeline.Add(new CategoricalOneHotVectorizer(&quot;Text1&quot;, &quot;Text2&quot;));
+        </code>
+      </example>
+    </member>
+  
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs b/src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs
index 2c4f877c1b..d1cc853d9b 100644
--- a/src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs
+++ b/src/Microsoft.ML.Core/EntryPoints/ModuleArgs.cs
@@ -529,9 +529,9 @@ public sealed class EntryPointAttribute : Attribute
             public string ShortName { get; set; }
 
             /// <summary>
-            /// Remarks on the Entry Point, for more extensive XML documentation on the C#API
+            /// The XML include element(s) that reference the documentation for the C# API component
             /// </summary>
-            public string Remarks { get; set; }
+            public string[] XmlInclude { get; set; }
         }
 
         /// <summary>
diff --git a/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs b/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
index af45202937..93db75c169 100644
--- a/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
+++ b/src/Microsoft.ML.Core/EntryPoints/ModuleCatalog.cs
@@ -44,7 +44,7 @@ public sealed class EntryPointInfo
             public readonly string Description;
             public readonly string ShortName;
             public readonly string FriendlyName;
-            public readonly string Remarks;
+            public readonly string[] XmlInclude;
             public readonly MethodInfo Method;
             public readonly Type InputType;
             public readonly Type OutputType;
@@ -64,7 +64,7 @@ internal EntryPointInfo(IExceptionContext ectx, MethodInfo method,
                 Method = method;
                 ShortName = attribute.ShortName;
                 FriendlyName = attribute.UserName;
-                Remarks = attribute.Remarks;
+                XmlInclude = attribute.XmlInclude;
                 ObsoleteAttribute = obsoleteAttribute;
 
                 // There are supposed to be 2 parameters, env and input for non-macro nodes.
diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs
index 01668eb3f2..11b302fbb6 100644
--- a/src/Microsoft.ML.FastTree/FastTree.cs
+++ b/src/Microsoft.ML.FastTree/FastTree.cs
@@ -82,31 +82,6 @@ public abstract class FastTreeTrainerBase<TArgs, TPredictor> :
 
         protected string InnerArgs => CmdParser.GetSettings(Host, Args, new TArgs());
 
-        internal const string Remarks = @"<remarks>
-<para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm. 
-Gradient boosting is a machine learning technique for regression problems. 
-It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next. 
-So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
-</para>
-<para>
-MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves. 
-A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input. 
-At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x <= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature. 
-The functions that can be produced by a regression tree are all the piece-wise constant functions.
-</para>
-<para>
-The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
-The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
-</para>
-<list type='bullet'>
-<item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-<item>In case of a regression problem, the output is the predicted value of the function.</item>
-<item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
-</list>
-<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-<a href='http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-</remarks>";
-
         public override bool NeedNormalization => false;
 
         public override bool WantCaching => false;
diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
index 7262659e43..9bd900a94e 100644
--- a/src/Microsoft.ML.FastTree/FastTreeArguments.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -20,6 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
     {
     }
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer
     {
         [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
index f694236166..df05261c3f 100644
--- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -336,13 +336,16 @@ public void AdjustTreeOutputs(IChannel ch, RegressionTree tree,
         }
     }
 
+    /// <summary>
+    /// The entry point for the FastTree binary classifier.
+    /// </summary>
     public static partial class FastTree
     {
         [TlcModule.EntryPoint(Name = "Trainers.FastTreeBinaryClassifier",
             Desc = FastTreeBinaryClassificationTrainer.Summary,
-            Remarks = FastTreeBinaryClassificationTrainer.Remarks,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
-            ShortName = FastTreeBinaryClassificationTrainer.ShortName)]
+            ShortName = FastTreeBinaryClassificationTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
index 70919fbdea..c196f9c303 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -38,6 +38,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
         IHasLabelGains
     {
@@ -1098,9 +1099,9 @@ public static partial class FastTree
     {
         [TlcModule.EntryPoint(Name = "Trainers.FastTreeRanker",
             Desc = FastTreeRankingTrainer.Summary,
-            Remarks = FastTreeRankingTrainer.Remarks,
             UserName = FastTreeRankingTrainer.UserNameValue,
-            ShortName = FastTreeRankingTrainer.ShortName)]
+            ShortName = FastTreeRankingTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
index 9c50fe75fe..a9f235f4d8 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -31,6 +31,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
     {
         public const string LoadNameValue = "FastTreeRegression";
@@ -450,9 +451,9 @@ public static partial class FastTree
     {
         [TlcModule.EntryPoint(Name = "Trainers.FastTreeRegressor",
             Desc = FastTreeRegressionTrainer.Summary,
-            Remarks = FastTreeRegressionTrainer.Remarks,
             UserName = FastTreeRegressionTrainer.UserNameValue,
-            ShortName = FastTreeRegressionTrainer.ShortName)]
+            ShortName = FastTreeRegressionTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
index 6d9bd58273..cf8c1fd8b2 100644
--- a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
@@ -27,21 +27,15 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <summary>
-    /// The Tweedie boosting model follows the mathematics established in:
-    /// Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
-    /// https://arxiv.org/pdf/1508.06378.pdf
-    /// </summary>
+    // The Tweedie boosting model follows the mathematics established in:
+    // Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
+    // https://arxiv.org/pdf/1508.06378.pdf
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
     public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
     {
         public const string LoadNameValue = "FastTreeTweedieRegression";
         public const string UserNameValue = "FastTree (Boosted Trees) Tweedie Regression";
         public const string Summary = "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.";
-        new public const string Remarks = @"<remarks>
-<a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>
-<a href='http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a>
-</remarks>";
-
         public const string ShortName = "fttweedie";
 
         private TestHistory _firstTestSetHistory;
@@ -466,7 +460,8 @@ public static partial class FastTree
         [TlcModule.EntryPoint(Name = "Trainers.FastTreeTweedieRegressor",
             Desc = FastTreeTweedieTrainer.Summary,
             UserName = FastTreeTweedieTrainer.UserNameValue,
-            ShortName = FastTreeTweedieTrainer.ShortName)]
+            ShortName = FastTreeTweedieTrainer.ShortName,
+            XmlInclude = new [] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForest.cs b/src/Microsoft.ML.FastTree/RandomForest.cs
index 2f670539b4..88676754d5 100644
--- a/src/Microsoft.ML.FastTree/RandomForest.cs
+++ b/src/Microsoft.ML.FastTree/RandomForest.cs
@@ -12,28 +12,6 @@ public abstract class RandomForestTrainerBase<TArgs, TPredictor> : FastTreeTrain
         where TArgs : FastForestArgumentsBase, new()
         where TPredictor : IPredictorProducing<Float>
     {
-        new internal const string Remarks = @"<remarks>
-Decision trees are non-parametric models that perform a sequence of simple tests on inputs. 
-This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed. 
-A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
-<para>Decision trees have several advantages:</para>
-<list type='bullet'>
-<item>They are efficient in both computation and memory usage during training and prediction. </item>
-<item>They can represent non-linear decision boundaries.</item>
-<item>They perform integrated feature selection and classification. </item>
-<item>They are resilient in the presence of noisy features.</item>
-</list>
-Fast forest is a random forest implementation. 
-The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction. 
-An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
-This decision forest classifier consists of an ensemble of decision trees. 
-Generally, ensemble models provide better coverage and accuracy than single decision trees. 
-Each tree in a decision forest outputs a Gaussian distribution.
-<a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
-<a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
-<a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
-</remarks>";
-
         private readonly bool _quantileEnabled;
 
         protected RandomForestTrainerBase(IHostEnvironment env, TArgs args, bool quantileEnabled = false)
diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
index e3e265cf13..43ed8b0561 100644
--- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -106,6 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         }
     }
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestClassification :
         RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -210,9 +211,9 @@ public static partial class FastForest
     {
         [TlcModule.EntryPoint(Name = "Trainers.FastForestBinaryClassifier",
             Desc = FastForestClassification.Summary,
-            Remarks = FastForestClassification.Remarks,
             UserName = FastForestClassification.UserNameValue,
-            ShortName = FastForestClassification.ShortName)]
+            ShortName = FastForestClassification.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
index ef9b3e5e9b..8483d99f01 100644
--- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -137,6 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
         }
     }
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
     {
         public sealed class Arguments : FastForestArgumentsBase
@@ -282,9 +283,9 @@ public static partial class FastForest
     {
         [TlcModule.EntryPoint(Name = "Trainers.FastForestRegressor",
             Desc = FastForestRegression.Summary,
-            Remarks = FastForestRegression.Remarks,
             UserName = FastForestRegression.LoadNameValue,
-            ShortName = FastForestRegression.ShortName)]
+            ShortName = FastForestRegression.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
index df386d0dc1..a8bb68b411 100644
--- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
+++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
@@ -28,6 +28,7 @@
 
 namespace Microsoft.ML.Runtime.KMeans
 {
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="KMeans++"]/*' />
     public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor>
     {
         public const string LoadNameValue = "KMeansPlusPlus";
@@ -36,14 +37,6 @@ public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor
         internal const string Summary = "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified "
             + "number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better "
             + "method for choosing the initial cluster centers.";
-        internal const string Remarks = @"<remarks>
-K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
-YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).   
-YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration. 
-It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations. 
-<a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.
-<a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a>
-</remarks>";
 
         public enum InitAlgorithm
         {
@@ -235,9 +228,9 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
 
         [TlcModule.EntryPoint(Name = "Trainers.KMeansPlusPlusClusterer",
             Desc = Summary,
-            Remarks = Remarks,
             UserName = UserNameValue,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
         public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 0b71bfa70e..45429d6c56 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -81,6 +81,7 @@ public static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float, IPredictorWithFeatureWeights<float>>
     {
         internal const string UserName = "LightGBM Binary Classifier";
@@ -131,9 +132,9 @@ public static partial class LightGbm
         [TlcModule.EntryPoint(
             Name = "Trainers.LightGbmBinaryClassifier", 
             Desc = LightGbmBinaryTrainer.Summary,
-            Remarks = LightGbmBinaryTrainer.Remarks,
             UserName = LightGbmBinaryTrainer.UserName, 
-            ShortName = LightGbmBinaryTrainer.ShortName)]
+            ShortName = LightGbmBinaryTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 479be65bec..5bc2838aea 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -18,6 +18,7 @@
 namespace Microsoft.ML.Runtime.LightGBM
 {
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmMulticlassTrainer : LightGbmTrainerBase<VBuffer<float>, OvaPredictor>
     {
         public const string Summary = "LightGBM Multi Class Classifier";
@@ -182,9 +183,9 @@ public static partial class LightGbm
         [TlcModule.EntryPoint(
             Name = "Trainers.LightGbmClassifier", 
             Desc = "Train a LightGBM multi class model.", 
-            Remarks = LightGbmMulticlassTrainer.Remarks,
             UserName = LightGbmMulticlassTrainer.Summary, 
-            ShortName = LightGbmMulticlassTrainer.ShortName)]
+            ShortName = LightGbmMulticlassTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index 2ed436b4eb..1a61ab4907 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -71,6 +71,7 @@ public static LightGbmRankingPredictor Create(IHostEnvironment env, ModelLoadCon
         public override PredictionKind PredictionKind { get { return PredictionKind.Ranking; } }
     }
 
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, LightGbmRankingPredictor>
     {
         public const string UserName = "LightGBM Ranking";
@@ -123,15 +124,15 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
     }
 
     /// <summary>
-    /// A component to train a LightGBM model.
+    /// The entry point for the LightGbmRankingTrainer.
     /// </summary>
     public static partial class LightGbm
     {
         [TlcModule.EntryPoint(Name = "Trainers.LightGbmRanker", 
-            Remarks = LightGbmMulticlassTrainer.Remarks,
             Desc = "Train a LightGBM ranking model.", 
             UserName = LightGbmRankingTrainer.UserName, 
-            ShortName = LightGbmRankingTrainer.ShortName)]
+            ShortName = LightGbmRankingTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 36c82aa79a..4b2d0f2a3b 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -20,7 +20,7 @@
 
 namespace Microsoft.ML.Runtime.LightGBM
 {
-
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRegressionPredictor : FastTreePredictionWrapper
     {
         public const string LoaderSignature = "LightGBMRegressionExec";
@@ -122,9 +122,9 @@ public static partial class LightGbm
     {
         [TlcModule.EntryPoint(Name = "Trainers.LightGbmRegressor", 
             Desc = LightGbmRegressorTrainer.Summary, 
-            Remarks = LightGbmRegressorTrainer.Remarks,
             UserName = LightGbmRegressorTrainer.UserNameValue, 
-            ShortName = LightGbmRegressorTrainer.ShortName)]
+            ShortName = LightGbmRegressorTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
index a93fa2ad60..c778c4ee23 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmTrainerBase.cs
@@ -58,9 +58,6 @@ private sealed class CategoricalMetaData
         protected int FeatureCount;
         protected FastTree.Internal.Ensemble TrainedEnsemble;
 
-        internal const string Remarks = @"<remarks>Light GBM is an open source implementation of boosted trees.
-<a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a></remarks>";
-
         #endregion
 
         protected LightGbmTrainerBase(IHostEnvironment env, LightGbmArguments args, PredictionKind predictionKind, string name)
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 6c114ef14d..2487ae64ba 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -286,9 +286,9 @@ private static void PostProcess(VBuffer<Float>[] y, Float[] sigma, Float[] z, in
 
         [TlcModule.EntryPoint(Name = "Trainers.PcaAnomalyDetector",
             Desc = "Train an PCA Anomaly model.",
-            Remarks = PcaPredictor.Remarks,
             UserName = UserNameValue,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -302,13 +302,13 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
         }
     }
 
-    /// <summary>
-    /// An anomaly detector using PCA.
-    /// - The algorithm uses the top eigenvectors to approximate the subspace containing the normal class
-    /// - For each new instance, it computes the norm difference between the raw feature vector and the projected feature on that subspace.
-    /// - - If the error is close to 0, the instance is considered normal (non-anomaly).
-    /// </summary>
+    // An anomaly detector using PCA.
+    // - The algorithm uses the top eigenvectors to approximate the subspace containing the normal class
+    // - For each new instance, it computes the norm difference between the raw feature vector and the projected feature on that subspace.
+    // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
+    // REVIEW: Include the above detail in the XML documentation file. 
+    /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,
@@ -316,14 +316,6 @@ public sealed class PcaPredictor : PredictorBase<Float>,
     {
         public const string LoaderSignature = "pcaAnomExec";
         public const string RegistrationName = "PCAPredictor";
-        internal const string Remarks = @"<remarks>
-<a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principle Component Analysis (PCA)</a> is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace.
-Its training is done using the technique described in the paper: <a href='https://arxiv.org/pdf/1310.6304v2.pdf'>Combining Structured and Unstructured Randomness in Large Scale PCA</a>, 
-and the paper <see href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</see>
-<a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
-<a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
-<a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
-</remarks>";
 
         private static VersionInfo GetVersionInfo()
         {
diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index cef2264677..bcac0386b1 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -26,6 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
+    /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -536,8 +537,10 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
             dst = new VBuffer<Float>(transformInfo.Rank, values, dst.Indices);
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", Desc = "Train an PCA Anomaly model.",
-            UserName = UserName, ShortName = ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", 
+            UserName = UserName, 
+            ShortName = ShortName, 
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input);
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
index 7a0a099031..dacbe1a33f 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
@@ -22,13 +22,14 @@
 
 namespace Microsoft.ML.Runtime.FactorizationMachine
 {
-    /// <summary>
-    /// Train a field-aware factorization machine using ADAGRAD (an advanced stochastic gradient method). See references below
-    /// for details. This trainer is essentially faster the one introduced in [2] because of some implemtation tricks[3].
-    /// [1] http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
-    /// [2] http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
-    /// [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
-    /// </summary>
+    /*
+     Train a field-aware factorization machine using ADAGRAD (an advanced stochastic gradient method). See references below
+     for details. This trainer is essentially faster than the one introduced in [2] because of some implementation tricks [3].
+     [1] http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
+     [2] http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
+     [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
+    */
+    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
     public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<RoleMappedData, FieldAwareFactorizationMachinePredictor>,
         IIncrementalTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>, IValidatingTrainer<RoleMappedData>,
         IIncrementalValidatingTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>
@@ -37,15 +38,6 @@ public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<RoleMapp
         public const string UserName = "Field-aware Factorization Machine";
         public const string LoadName = "FieldAwareFactorizationMachine";
         public const string ShortName = "ffm";
-        internal const string Remarks = @"<remarks>
-Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
-The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
-An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
-<a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
-<a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a>
-<a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
-<a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
-</remarks>";
 
         public sealed class Arguments : LearnerInputBaseWithLabel
         {
@@ -413,9 +405,9 @@ public override FieldAwareFactorizationMachinePredictor CreatePredictor()
 
         [TlcModule.EntryPoint(Name = "Trainers.FieldAwareFactorizationMachineBinaryClassifier",
             Desc = Summary,
-            Remarks = Remarks,
             UserName = UserName,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
index 738c0197cb..dd167b884b 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
@@ -222,26 +222,6 @@ internal virtual void Check(IHostEnvironment env)
             }
         }
 
-        internal const string Remarks = @"<remarks>
-This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
-The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation 
-that supports multi-threading.
-<para>
-Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
-Several choices of loss functions are also provided.
-The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
-</para>
-<para>
-Note that SDCA is a stochastic and streaming optimization algorithm. 
-The results depends on the order of the training data. For reproducible results, it is recommended that one sets <paramref name='Shuffle'/> to
-False and <paramref name='NumThreads'/> to 1.
-Elastic net regularization can be specified by the <paramref name='L2Const'/> and <paramref name='L1Threshold'/> parameters. Note that the <paramref name='L2Const'/> has an effect on the rate of convergence. 
-In general, the larger the <paramref name='L2Const'/>, the faster SDCA converges.
-</para>
-<a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-<a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-</remarks>";
-
         // The order of these matter, since they are used as indices into arrays.
         protected enum MetricKind
         {
@@ -1797,9 +1777,9 @@ public static partial class Sdca
     {
         [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentBinaryClassifier",
             Desc = "Train an SDCA binary model.",
-            Remarks = LinearClassificationTrainer.Remarks,
             UserName = LinearClassificationTrainer.UserNameValue,
-            ShortName = LinearClassificationTrainer.LoadNameValue)]
+            ShortName = LinearClassificationTrainer.LoadNameValue,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""StochasticDualCoordinateAscentBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LinearClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
index 95982047aa..89f4866228 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LbfgsPredictorBase.cs
@@ -94,35 +94,6 @@ public abstract class ArgumentsBase : LearnerInputBaseWithWeight
             public bool EnforceNonNegativity = false;
         }
 
-        internal const string Remarks = @"<remarks>
-If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
-<para>
-The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). 
-Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. 
-But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, 
-so that it is especially suited for problems with a large number of variables. 
-The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
-</para>
-<para>
-This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. 
-Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. 
-This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. 
-Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. 
-An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
-<list type='bullet'>
-<item><paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data. 
-It pulls small weights associated features that are relatively unimportant towards 0.</item>
-<item><paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero. </item>
-</list>
-Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. 
-The default values of x and y are both 1. 
-An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
-<a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.
-<a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.
-<a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.
-<a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.
-</remarks>";
-
         protected int NumFeatures;
         protected VBuffer<Float> CurrentWeights;
         protected long NumGoodRows;
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index f1d35950ba..504bdca5f6 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -30,6 +30,8 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using Mkl = Microsoft.ML.Runtime.Learners.OlsLinearRegressionTrainer.Mkl;
 
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
     public sealed partial class LogisticRegression : LbfgsTrainerBase<Float, ParameterMixingCalibratedPredictor>
     {
         public const string LoadNameValue = "LogisticRegression";
@@ -388,9 +390,11 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
 
         [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionBinaryClassifier",
             Desc = Summary,
-            Remarks = Remarks,
             UserName = UserNameValue,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
+                            
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index 3cf22d98fa..27e86682bd 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -36,6 +36,8 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
     public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<VBuffer<Float>, MulticlassLogisticRegressionPredictor>
     {
         public const string LoadNameValue = "MultiClassLogisticRegression";
@@ -962,9 +964,10 @@ public partial class LogisticRegression
     {
         [TlcModule.EntryPoint(Name = "Trainers.LogisticRegressionClassifier",
             Desc = Summary,
-            Remarks = MulticlassLogisticRegression.Remarks,
             UserName = MulticlassLogisticRegression.UserNameValue,
-            ShortName = MulticlassLogisticRegression.ShortName)]
+            ShortName = MulticlassLogisticRegression.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index ac259f66db..50ddc7fadb 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -23,13 +23,12 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <summary>
-    /// This is an averaged perceptron classifier.
-    /// Configurable subcomponents:
-    ///     - Loss function. By default, hinge loss (aka max-margin avgd perceptron)
-    ///     - Feature normalization. By default, rescaling between min and max values for every feature
-    ///     - Prediction calibration to produce probabilities. Off by default, if on, uses exponential (aka Platt) calibration.
-    /// </summary>
+    // This is an averaged perceptron classifier.
+    // Configurable subcomponents:
+    //     - Loss function. By default, hinge loss (aka max-margin avgd perceptron)
+    //     - Feature normalization. By default, rescaling between min and max values for every feature
+    //     - Prediction calibration to produce probabilities. Off by default, if on, uses exponential (aka Platt) calibration.
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="AP"]/*' />
     public sealed class AveragedPerceptronTrainer :
         AveragedLinearTrainer<AveragedPerceptronTrainer.Arguments, LinearBinaryPredictor>
     {
@@ -37,25 +36,6 @@ public sealed class AveragedPerceptronTrainer :
         internal const string UserNameValue = "Averaged Perceptron";
         internal const string ShortName = "ap";
         internal const string Summary = "Averaged Perceptron Binary Classifier.";
-        internal const string Remarks = @"<remarks>
-Perceptron is a classification algorithm that makes its predictions based on a linear function.
-I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.
-<para>
-Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
-The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. 
-If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
-the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
-multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, 
-and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
-</para>
-<para>
-In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, 
-together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
-The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.
-</para>
-<a href='https://en.wikipedia.org/wiki/Perceptron'>Wikipedia entry for Perceptron</a>
-<a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a>
-</remarks>";
 
         public class Arguments : AveragedLinearArguments
         {
@@ -112,9 +92,9 @@ public override LinearBinaryPredictor CreatePredictor()
 
         [TlcModule.EntryPoint(Name = "Trainers.AveragedPerceptronBinaryClassifier",
             Desc = Summary,
-            Remarks = Remarks,
             UserName = UserNameValue,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""AP""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
index dee63ccf37..c67f10e2c0 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
@@ -27,6 +27,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TPredictor = LinearRegressionPredictor;
 
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="OGD"]/*' />
     public sealed class OnlineGradientDescentTrainer : AveragedLinearTrainer<OnlineGradientDescentTrainer.Arguments, TPredictor>
     {
         internal const string LoadNameValue = "OnlineGradientDescent";
@@ -34,11 +35,6 @@ public sealed class OnlineGradientDescentTrainer : AveragedLinearTrainer<OnlineG
         internal const string Summary = "Stochastic gradient descent is an optimization method used to train a wide range of models in machine learning. "
             + "In the TLC implementation of OGD, it is for linear regression.";
         internal const string ShortName = "ogd";
-        internal const string Remarks = @"<remarks>
-Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
-The OnlineGradientDescentRegressor implements the standard (non-batch) SGD, with a choice of loss functions,
-and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
-</remarks>";
 
         public sealed class Arguments : AveragedLinearArguments
         {
@@ -96,9 +92,9 @@ public override TPredictor CreatePredictor()
 
         [TlcModule.EntryPoint(Name = "Trainers.OnlineGradientDescentRegressor",
             Desc = "Train a Online gradient descent perceptron.",
-            Remarks = Remarks,
             UserName = UserNameValue,
-            ShortName = ShortName)]
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
index c5ad4b4495..4d407fbfa7 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
@@ -26,17 +26,13 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
+    /// <include file='../../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
     public sealed class PoissonRegression : LbfgsTrainerBase<Float, PoissonRegressionPredictor>
     {
         internal const string LoadNameValue = "PoissonRegression";
         internal const string UserNameValue = "Poisson Regression";
         internal const string ShortName = "PR";
         internal const string Summary = "Poisson Regression assumes the unknown function, denoted Y has a Poisson distribution.";
-        new internal const string Remarks = @"<remarks>
-<a href='https://en.wikipedia.org/wiki/Poisson_regression'>Poisson regression</a> is a parameterized regression method. 
-It assumes that the log of the conditional mean of the dependent variable follows a linear function of the dependent variables. 
-Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations.
-</remarks>";
 
         public sealed class Arguments : ArgumentsBase
         {
@@ -129,7 +125,11 @@ protected override void ProcessPriorDistribution(Float label, Float weight)
             // No-op by design.
         }
 
-        [TlcModule.EntryPoint(Name = "Trainers.PoissonRegressor", Desc = "Train an Poisson regression model.", UserName = UserNameValue, ShortName = ShortName)]
+        [TlcModule.EntryPoint(Name = "Trainers.PoissonRegressor", 
+            Desc = "Train an Poisson regression model.", 
+            UserName = UserNameValue, 
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
index 9b00251139..08a13ac7a1 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
@@ -28,9 +28,8 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TVectorPredictor = IPredictorProducing<VBuffer<Float>>;
 
-    /// <summary>
-    /// SDCA linear multiclass trainer.
-    /// </summary>
+    // SDCA linear multiclass trainer.
+    /// <include file='../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
     public class SdcaMultiClassTrainer : SdcaTrainerBase<TVectorPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAMC";
@@ -382,15 +381,15 @@ protected override Float GetInstanceWeight(FloatLabelCursor cursor)
     }
 
     /// <summary>
-    /// A component to train an SDCA model.
+    /// The Entry Point for SDCA multiclass.
     /// </summary>
     public static partial class Sdca
     {
         [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentClassifier",
             Desc = SdcaMultiClassTrainer.Summary,
-            Remarks = SdcaMultiClassTrainer.Remarks,
             UserName = SdcaMultiClassTrainer.UserNameValue,
-            ShortName = SdcaMultiClassTrainer.ShortName)]
+            ShortName = SdcaMultiClassTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
index 422b63f397..cdd4ce6411 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
@@ -25,6 +25,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TScalarPredictor = IPredictorWithFeatureWeights<Float>;
 
+    /// <include file='../../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
     public sealed class SdcaRegressionTrainer : SdcaTrainerBase<IPredictor>, ITrainer<RoleMappedData, TScalarPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAR";
@@ -127,15 +128,15 @@ protected override Float TuneDefaultL2(IChannel ch, int maxIterations, long rowC
     }
 
     /// <summary>
-    /// A component to train an SDCA model.
+    /// The Entry Point for the SDCA regressor.
     /// </summary>
     public static partial class Sdca
     {
         [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentRegressor",
             Desc = SdcaRegressionTrainer.Summary,
-            Remarks = SdcaRegressionTrainer.Remarks,
             UserName = SdcaRegressionTrainer.UserNameValue,
-            ShortName = SdcaRegressionTrainer.ShortName)]
+            ShortName = SdcaRegressionTrainer.ShortName,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
index 42f506930a..dace70510a 100644
--- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
@@ -19,6 +19,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
+    /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
     public static class CategoricalHashTransform
     {
         public const int NumBitsLim = 31; // can't convert 31-bit hashes to indicator vectors, so max is 30
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index 70eed46248..2f0133a0c3 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -21,19 +21,7 @@
 [assembly: LoadableClass(typeof(void), typeof(Categorical), null, typeof(SignatureEntryPointModule), "Categorical")]
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// Categorical trans.
-    /// Each column can specify an output kind, Bag, Ind, or Key.
-    /// Notes:
-    /// * Each column builds/uses exactly one "vocabulary" (dictionary).
-    /// * The Key output kind produces integer values and KeyType columns.
-    /// * The Key value is the one-based index of the slot set in the Ind/Bag options.
-    /// * In the Key option, not found is assigned the value zero.
-    /// * In the Ind/Bag options, not found results in an all zero bit vector.
-    /// * Ind and Bag differ simply in how the bit-vectors generated from individual slots are aggregated:
-    ///   for Ind they are concatenated and for Bag they are added.
-    /// * When the source column is a singleton, the Ind and Bag options are identical.
-    /// </summary>
+    /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
     public static class CategoricalTransform
     {
         public enum OutputKind : byte
@@ -255,7 +243,9 @@ public static IDataTransform CreateTransformCore(
 
     public static class Categorical
     {
-        [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", Desc = "Encodes the categorical variable with one-hot encoding based on term dictionary", UserName = CategoricalTransform.UserName)]
+        [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
+            UserName = CategoricalTransform.UserName, 
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -267,7 +257,9 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
             return new CommonOutputs.TransformOutput { Model = new TransformModel(env, xf, input.Data), OutputData = xf };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", Desc = "Encodes the categorical variable with hash-based encoding", UserName = CategoricalHashTransform.UserName)]
+        [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
+            UserName = CategoricalHashTransform.UserName ,
+            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index a0f7f17975..0fe478c632 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -4095,28 +4095,7 @@ public sealed class Output
     namespace Trainers
     {
 
-        /// <summary>
-        /// Averaged Perceptron Binary Classifier.
-        /// </summary>
-        /// <remarks>
-        /// Perceptron is a classification algorithm that makes its predictions based on a linear function.
-        /// I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.
-        /// <para>
-        /// Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
-        /// The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed. 
-        /// If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
-        /// the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
-        /// multiplied by a factor 0 < a <= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate, 
-        /// and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
-        /// </para>
-        /// <para>
-        /// In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored, 
-        /// together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
-        /// The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.
-        /// </para>
-        /// <a href='https://en.wikipedia.org/wiki/Perceptron'>Wikipedia entry for Perceptron</a>
-        /// <a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4620,30 +4599,7 @@ public enum Bundle : byte
         }
 
 
-        /// <summary>
-        /// Uses a random forest learner to perform binary classification.
-        /// </summary>
-        /// <remarks>
-        /// Decision trees are non-parametric models that perform a sequence of simple tests on inputs. 
-        /// This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed. 
-        /// A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
-        /// <para>Decision trees have several advantages:</para>
-        /// <list type='bullet'>
-        /// <item>They are efficient in both computation and memory usage during training and prediction. </item>
-        /// <item>They can represent non-linear decision boundaries.</item>
-        /// <item>They perform integrated feature selection and classification. </item>
-        /// <item>They are resilient in the presence of noisy features.</item>
-        /// </list>
-        /// Fast forest is a random forest implementation. 
-        /// The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction. 
-        /// An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
-        /// This decision forest classifier consists of an ensemble of decision trees. 
-        /// Generally, ensemble models provide better coverage and accuracy than single decision trees. 
-        /// Each tree in a decision forest outputs a Gaussian distribution.
-        /// <a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
-        /// <a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
-        /// <a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4934,30 +4890,7 @@ public FastForestBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Trains a random forest to fit target values using least-squares.
-        /// </summary>
-        /// <remarks>
-        /// Decision trees are non-parametric models that perform a sequence of simple tests on inputs. 
-        /// This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed. 
-        /// A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
-        /// <para>Decision trees have several advantages:</para>
-        /// <list type='bullet'>
-        /// <item>They are efficient in both computation and memory usage during training and prediction. </item>
-        /// <item>They can represent non-linear decision boundaries.</item>
-        /// <item>They perform integrated feature selection and classification. </item>
-        /// <item>They are resilient in the presence of noisy features.</item>
-        /// </list>
-        /// Fast forest is a random forest implementation. 
-        /// The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction. 
-        /// An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
-        /// This decision forest classifier consists of an ensemble of decision trees. 
-        /// Generally, ensemble models provide better coverage and accuracy than single decision trees. 
-        /// Each tree in a decision forest outputs a Gaussian distribution.
-        /// <a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
-        /// <a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
-        /// <a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5244,33 +5177,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
         }
 
 
-        /// <summary>
-        /// Uses a logit-boost boosted tree learner to perform binary classification.
-        /// </summary>
-        /// <remarks>
-        /// <para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm. 
-        /// Gradient boosting is a machine learning technique for regression problems. 
-        /// It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next. 
-        /// So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
-        /// </para>
-        /// <para>
-        /// MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves. 
-        /// A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input. 
-        /// At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x <= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature. 
-        /// The functions that can be produced by a regression tree are all the piece-wise constant functions.
-        /// </para>
-        /// <para>
-        /// The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
-        /// The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
-        /// </para>
-        /// <list type='bullet'>
-        /// <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-        /// <item>In case of a regression problem, the output is the predicted value of the function.</item>
-        /// <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
-        /// </list>
-        /// <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-        /// <a href='http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5659,33 +5566,7 @@ public FastTreeBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Trains gradient boosted decision trees to the LambdaRank quasi-gradient.
-        /// </summary>
-        /// <remarks>
-        /// <para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm. 
-        /// Gradient boosting is a machine learning technique for regression problems. 
-        /// It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next. 
-        /// So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
-        /// </para>
-        /// <para>
-        /// MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves. 
-        /// A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input. 
-        /// At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x <= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature. 
-        /// The functions that can be produced by a regression tree are all the piece-wise constant functions.
-        /// </para>
-        /// <para>
-        /// The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
-        /// The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
-        /// </para>
-        /// <list type='bullet'>
-        /// <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-        /// <item>In case of a regression problem, the output is the predicted value of the function.</item>
-        /// <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
-        /// </list>
-        /// <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-        /// <a href='http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6109,33 +5990,7 @@ public FastTreeRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Trains gradient boosted decision trees to fit target values using least-squares.
-        /// </summary>
-        /// <remarks>
-        /// <para>FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm. 
-        /// Gradient boosting is a machine learning technique for regression problems. 
-        /// It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next. 
-        /// So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
-        /// </para>
-        /// <para>
-        /// MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves. 
-        /// A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input. 
-        /// At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x <= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature. 
-        /// The functions that can be produced by a regression tree are all the piece-wise constant functions.
-        /// </para>
-        /// <para>
-        /// The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
-        /// The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
-        /// </para>
-        /// <list type='bullet'>
-        /// <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-        /// <item>In case of a regression problem, the output is the predicted value of the function.</item>
-        /// <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
-        /// </list>
-        /// <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-        /// <a href='http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6519,9 +6374,7 @@ public FastTreeRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
         public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6910,18 +6763,7 @@ public FastTreeTweedieRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train a field-aware factorization machine for binary classification
-        /// </summary>
-        /// <remarks>
-        /// Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
-        /// The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
-        /// An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
-        /// <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
-        /// <a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a>
-        /// <a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
-        /// <a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7348,17 +7190,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
         }
 
 
-        /// <summary>
-        /// K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.
-        /// </summary>
-        /// <remarks>
-        /// K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
-        /// YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).   
-        /// YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration. 
-        /// It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations. 
-        /// <a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.
-        /// <a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7474,11 +7306,7 @@ public enum LightGbmArgumentsEvalMetricType
         }
 
 
-        /// <summary>
-        /// Train a LightGBM binary classification model.
-        /// </summary>
-        /// <remarks>Light GBM is an open source implementation of boosted trees.
-        /// <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a></remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7681,11 +7509,7 @@ public LightGbmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train a LightGBM multi class model.
-        /// </summary>
-        /// <remarks>Light GBM is an open source implementation of boosted trees.
-        /// <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a></remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7888,11 +7712,7 @@ public LightGbmClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train a LightGBM ranking model.
-        /// </summary>
-        /// <remarks>Light GBM is an open source implementation of boosted trees.
-        /// <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a></remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8098,8 +7918,6 @@ namespace Trainers
         /// <summary>
         /// LightGBM Regression
         /// </summary>
-        /// <remarks>Light GBM is an open source implementation of boosted trees.
-        /// <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a></remarks>
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8437,37 +8255,8 @@ public LinearSvmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.
-        /// </summary>
-        /// <remarks>
-        /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
-        /// <para>
-        /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). 
-        /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. 
-        /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, 
-        /// so that it is especially suited for problems with a large number of variables. 
-        /// The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
-        /// </para>
-        /// <para>
-        /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. 
-        /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. 
-        /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. 
-        /// Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. 
-        /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
-        /// <list type='bullet'>
-        /// <item><paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data. 
-        /// It pulls small weights associated features that are relatively unimportant towards 0.</item>
-        /// <item><paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero. </item>
-        /// </list>
-        /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. 
-        /// The default values of x and y are both 1. 
-        /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
-        /// <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.
-        /// <a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.
-        /// <a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.
-        /// <a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8615,37 +8404,8 @@ public LogisticRegressionBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.
-        /// </summary>
-        /// <remarks>
-        /// If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
-        /// <para>
-        /// The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). 
-        /// Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. 
-        /// But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, 
-        /// so that it is especially suited for problems with a large number of variables. 
-        /// The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
-        /// </para>
-        /// <para>
-        /// This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. 
-        /// Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. 
-        /// This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. 
-        /// Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. 
-        /// An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
-        /// <list type='bullet'>
-        /// <item><paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data. 
-        /// It pulls small weights associated features that are relatively unimportant towards 0.</item>
-        /// <item><paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero. </item>
-        /// </list>
-        /// Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. 
-        /// The default values of x and y are both 1. 
-        /// An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
-        /// <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.
-        /// <a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.
-        /// <a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.
-        /// <a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8869,11 +8629,6 @@ namespace Trainers
         /// <summary>
         /// Train a Online gradient descent perceptron.
         /// </summary>
-        /// <remarks>
-        /// Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
-        /// The OnlineGradientDescentRegressor implements the standard (non-batch) SGD, with a choice of loss functions,
-        /// and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
-        /// </remarks>
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9026,17 +8781,7 @@ public OnlineGradientDescentRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train an PCA Anomaly model.
-        /// </summary>
-        /// <remarks>
-        /// <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principle Component Analysis (PCA)</a> is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace.
-        /// Its training is done using the technique described in the paper: <a href='https://arxiv.org/pdf/1310.6304v2.pdf'>Combining Structured and Unstructured Randomness in Large Scale PCA</a>, 
-        /// and the paper <see href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</see>
-        /// <a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
-        /// <a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
-        /// <a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9130,9 +8875,7 @@ public PcaAnomalyDetectorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train an Poisson regression model.
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9275,28 +9018,7 @@ public PoissonRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train an SDCA binary model.
-        /// </summary>
-        /// <remarks>
-        /// This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
-        /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation 
-        /// that supports multi-threading.
-        /// <para>
-        /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
-        /// Several choices of loss functions are also provided.
-        /// The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
-        /// </para>
-        /// <para>
-        /// Note that SDCA is a stochastic and streaming optimization algorithm. 
-        /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets <paramref name='Shuffle'/> to
-        /// False and <paramref name='NumThreads'/> to 1.
-        /// Elastic net regularization can be specified by the <paramref name='L2Const'/> and <paramref name='L1Threshold'/> parameters. Note that the <paramref name='L2Const'/> has an effect on the rate of convergence. 
-        /// In general, the larger the <paramref name='L2Const'/>, the faster SDCA converges.
-        /// </para>
-        /// <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-        /// <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="StochasticDualCoordinateAscentBinaryClassifier"]/*' />
         public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9435,28 +9157,7 @@ public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// The SDCA linear multi-class classification trainer.
-        /// </summary>
-        /// <remarks>
-        /// This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
-        /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation 
-        /// that supports multi-threading.
-        /// <para>
-        /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
-        /// Several choices of loss functions are also provided.
-        /// The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
-        /// </para>
-        /// <para>
-        /// Note that SDCA is a stochastic and streaming optimization algorithm. 
-        /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets <paramref name='Shuffle'/> to
-        /// False and <paramref name='NumThreads'/> to 1.
-        /// Elastic net regularization can be specified by the <paramref name='L2Const'/> and <paramref name='L1Threshold'/> parameters. Note that the <paramref name='L2Const'/> has an effect on the rate of convergence. 
-        /// In general, the larger the <paramref name='L2Const'/>, the faster SDCA converges.
-        /// </para>
-        /// <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-        /// <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9579,28 +9280,7 @@ public StochasticDualCoordinateAscentClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// The SDCA linear regression trainer.
-        /// </summary>
-        /// <remarks>
-        /// This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
-        /// The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation 
-        /// that supports multi-threading.
-        /// <para>
-        /// Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
-        /// Several choices of loss functions are also provided.
-        /// The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
-        /// </para>
-        /// <para>
-        /// Note that SDCA is a stochastic and streaming optimization algorithm. 
-        /// The results depends on the order of the training data. For reproducible results, it is recommended that one sets <paramref name='Shuffle'/> to
-        /// False and <paramref name='NumThreads'/> to 1.
-        /// Elastic net regularization can be specified by the <paramref name='L2Const'/> and <paramref name='L1Threshold'/> parameters. Note that the <paramref name='L2Const'/> has an effect on the rate of convergence. 
-        /// In general, the larger the <paramref name='L2Const'/>, the faster SDCA converges.
-        /// </para>
-        /// <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-        /// <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-        /// </remarks>
+        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10203,9 +9883,7 @@ public sealed partial class CategoricalHashTransformColumn : OneToOneColumn<Cate
 
         }
 
-        /// <summary>
-        /// Encodes the categorical variable with hash-based encoding
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
         public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10378,9 +10056,7 @@ public sealed partial class CategoricalTransformColumn : OneToOneColumn<Categori
 
         }
 
-        /// <summary>
-        /// Encodes the categorical variable with one-hot encoding based on term dictionary
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
         public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14311,9 +13987,7 @@ public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColu
 
         }
 
-        /// <summary>
-        /// Train an PCA Anomaly model.
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
diff --git a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs
index 29884fe620..ba130c96e3 100644
--- a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs
+++ b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpApiGenerator.cs
@@ -382,7 +382,7 @@ private void GenerateInput(IndentingTextWriter writer, ModuleCatalog.EntryPointI
             GenerateEnums(writer, entryPointInfo.InputType, _defaultNamespace + entryPointMetadata.Namespace);
             writer.WriteLine();
             GenerateClasses(writer, entryPointInfo.InputType, catalog, _defaultNamespace + entryPointMetadata.Namespace);
-            CSharpGeneratorUtils.GenerateSummary(writer, entryPointInfo.Description, entryPointInfo.Remarks);
+            CSharpGeneratorUtils.GenerateSummary(writer, entryPointInfo.Description, entryPointInfo.XmlInclude);
 
             if (entryPointInfo.ObsoleteAttribute != null)
                 writer.WriteLine($"[Obsolete(\"{entryPointInfo.ObsoleteAttribute.Message}\")]");
diff --git a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpGeneratorUtils.cs b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpGeneratorUtils.cs
index 1cab5cc35c..09dea02cc2 100644
--- a/src/Microsoft.ML/Runtime/Internal/Tools/CSharpGeneratorUtils.cs
+++ b/src/Microsoft.ML/Runtime/Internal/Tools/CSharpGeneratorUtils.cs
@@ -349,18 +349,23 @@ public static string GetComponentName(ModuleCatalog.ComponentInfo component)
             return $"{Capitalize(component.Name)}{component.Kind}";
         }
 
-        public static void GenerateSummary(IndentingTextWriter writer, string summary, string remarks = null)
+        public static void GenerateSummary(IndentingTextWriter writer, string summary, string[] xmlInclude = null)
         {
+            // If the class has an XML <include>, it should contain the summary and everything else.
+            if (xmlInclude != null)
+            {
+                foreach (var line in xmlInclude)
+                    writer.WriteLine($"/// {line}");
+
+                return;
+            }
+
             if (string.IsNullOrEmpty(summary))
                 return;
             writer.WriteLine("/// <summary>");
             foreach (var line in summary.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries))
                 writer.WriteLine($"/// {line}");
             writer.WriteLine("/// </summary>");
-
-            if(!string.IsNullOrEmpty(remarks))
-                foreach (var line in remarks.Split(new[] { Environment.NewLine }, StringSplitOptions.None))
-                    writer.WriteLine($"/// {line}");
         }
 
         public static void GenerateHeader(IndentingTextWriter writer)

From 266952cf2071571cfde7d723b71cc49f9c65a9b9 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 9 Jul 2018 11:02:21 -0700
Subject: [PATCH 02/14] fixing the aftermath of renaming the XML file.

---
 src/Microsoft.ML.PCA/PcaTrainer.cs |  2 +-
 src/Microsoft.ML/CSharpApi.cs      | 48 ++++++++++++++----------------
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 2487ae64ba..ff8e74f122 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -308,7 +308,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
     // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
     // REVIEW: Include the above detail in the XML documentation file. 
-    /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index 0fe478c632..d82ac58d4f 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -4095,7 +4095,7 @@ public sealed class Output
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="AP"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4599,7 +4599,7 @@ public enum Bundle : byte
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4890,7 +4890,7 @@ public FastForestBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5177,7 +5177,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5566,7 +5566,7 @@ public FastTreeBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5990,7 +5990,7 @@ public FastTreeRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6374,7 +6374,7 @@ public FastTreeRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
         public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6763,7 +6763,7 @@ public FastTreeTweedieRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7190,7 +7190,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="KMeans++"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7306,7 +7306,7 @@ public enum LightGbmArgumentsEvalMetricType
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7509,7 +7509,7 @@ public LightGbmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7712,7 +7712,7 @@ public LightGbmClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7915,9 +7915,7 @@ public LightGbmRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// LightGBM Regression
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8255,8 +8253,8 @@ public LinearSvmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8404,8 +8402,8 @@ public LogisticRegressionBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8626,9 +8624,7 @@ public NaiveBayesClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train a Online gradient descent perceptron.
-        /// </summary>
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="OGD"]/*' />
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8875,7 +8871,7 @@ public PcaAnomalyDetectorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9018,7 +9014,7 @@ public PoissonRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="StochasticDualCoordinateAscentBinaryClassifier"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="StochasticDualCoordinateAscentBinaryClassifier"]/*' />
         public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9157,7 +9153,7 @@ public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9280,7 +9276,7 @@ public StochasticDualCoordinateAscentClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/StandardLearners.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 

From 0b17a0b5332e01d597fbfdeb1e3efb30f99e9d51 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 9 Jul 2018 13:21:52 -0700
Subject: [PATCH 03/14] removing the Desc from the EntryPoint attribute is a
 bad idea.

---
 src/Microsoft.ML.PCA/PcaTransform.cs                      | 1 +
 src/Microsoft.ML.Transforms/CategoricalTransform.cs       | 2 ++
 test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv   | 6 +++---
 test/BaselineOutput/Common/EntryPoints/core_manifest.json | 6 +++---
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index bcac0386b1..9dd48c600e 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -538,6 +538,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
         }
 
         [TlcModule.EntryPoint(Name = "Transforms.PcaCalculator", 
+            Desc = Summary,
             UserName = UserName, 
             ShortName = ShortName, 
             XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index 2f0133a0c3..d2f7b5a154 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -244,6 +244,7 @@ public static IDataTransform CreateTransformCore(
     public static class Categorical
     {
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
+            Desc = CategoricalTransform.Summary,
             UserName = CategoricalTransform.UserName, 
             XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
@@ -258,6 +259,7 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
         }
 
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
+            Desc = CategoricalHashTransform.Summary,
             UserName = CategoricalHashTransform.UserName ,
             XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
index 9d250f4c7d..644318c05a 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
+++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -68,8 +68,8 @@ Trainers.StochasticGradientDescentBinaryClassifier	Train an Hogwild SGD binary m
 Transforms.ApproximateBootstrapSampler	Approximate bootstrap sampling.	Microsoft.ML.Runtime.Data.BootstrapSample	GetSample	Microsoft.ML.Runtime.Data.BootstrapSampleTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.BinaryPredictionScoreColumnsRenamer	For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.	Microsoft.ML.Runtime.EntryPoints.ScoreModel	RenameBinaryPredictionScoreColumns	Microsoft.ML.Runtime.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.BinNormalizer	The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.	Microsoft.ML.Runtime.Data.Normalize	Bin	Microsoft.ML.Runtime.Data.NormalizeTransform+BinArguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
-Transforms.CategoricalHashOneHotVectorizer	Encodes the categorical variable with hash-based encoding	Microsoft.ML.Runtime.Data.Categorical	CatTransformHash	Microsoft.ML.Runtime.Data.CategoricalHashTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
-Transforms.CategoricalOneHotVectorizer	Encodes the categorical variable with one-hot encoding based on term dictionary	Microsoft.ML.Runtime.Data.Categorical	CatTransformDict	Microsoft.ML.Runtime.Data.CategoricalTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
+Transforms.CategoricalHashOneHotVectorizer	Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it.	Microsoft.ML.Runtime.Data.Categorical	CatTransformHash	Microsoft.ML.Runtime.Data.CategoricalHashTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
+Transforms.CategoricalOneHotVectorizer	Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array.	Microsoft.ML.Runtime.Data.Categorical	CatTransformDict	Microsoft.ML.Runtime.Data.CategoricalTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.CharacterTokenizer	Character-oriented tokenizer where text is considered a sequence of characters.	Microsoft.ML.Runtime.Transforms.TextAnalytics	CharTokenize	Microsoft.ML.Runtime.TextAnalytics.CharTokenizeTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.ColumnConcatenator	Concatenates one or more columns of the same item type.	Microsoft.ML.Runtime.EntryPoints.SchemaManipulation	ConcatColumns	Microsoft.ML.Runtime.Data.ConcatTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.ColumnCopier	Duplicates columns from the dataset	Microsoft.ML.Runtime.EntryPoints.SchemaManipulation	CopyColumns	Microsoft.ML.Runtime.Data.CopyColumnsTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
@@ -106,7 +106,7 @@ Transforms.ModelCombiner	Combines a sequence of TransformModels into a single mo
 Transforms.NGramTranslator	Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.	Microsoft.ML.Runtime.Transforms.TextAnalytics	NGramTransform	Microsoft.ML.Runtime.Data.NgramTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.NoOperation	Does nothing.	Microsoft.ML.Runtime.Data.NopTransform	Nop	Microsoft.ML.Runtime.Data.NopTransform+NopInput	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.OptionalColumnCreator	If the source column does not exist after deserialization, create a column with the right type and default values.	Microsoft.ML.Runtime.DataPipe.OptionalColumnTransform	MakeOptional	Microsoft.ML.Runtime.DataPipe.OptionalColumnTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
-Transforms.PcaCalculator	Train an PCA Anomaly model.	Microsoft.ML.Runtime.Data.PcaTransform	Calculate	Microsoft.ML.Runtime.Data.PcaTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
+Transforms.PcaCalculator	PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace.	Microsoft.ML.Runtime.Data.PcaTransform	Calculate	Microsoft.ML.Runtime.Data.PcaTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.PredictedLabelColumnOriginalValueConverter	Transforms a predicted label column to its original values, unless it is of type bool.	Microsoft.ML.Runtime.EntryPoints.FeatureCombiner	ConvertPredictedLabel	Microsoft.ML.Runtime.EntryPoints.FeatureCombiner+PredictedLabelInput	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.RandomNumberGenerator	Adds a column with a generated number sequence.	Microsoft.ML.Runtime.Data.RandomNumberGenerator	Generate	Microsoft.ML.Runtime.Data.GenerateNumberTransform+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
 Transforms.RowRangeFilter	Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.	Microsoft.ML.Runtime.EntryPoints.SelectRows	FilterByRange	Microsoft.ML.Runtime.Data.RangeFilter+Arguments	Microsoft.ML.Runtime.EntryPoints.CommonOutputs+TransformOutput
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 9b095d83ea..ed80555eb1 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -15823,7 +15823,7 @@
     },
     {
       "Name": "Transforms.CategoricalHashOneHotVectorizer",
-      "Desc": "Encodes the categorical variable with hash-based encoding",
+      "Desc": "Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it.",
       "FriendlyName": "Categorical Hash Transform",
       "ShortName": null,
       "Inputs": [
@@ -16029,7 +16029,7 @@
     },
     {
       "Name": "Transforms.CategoricalOneHotVectorizer",
-      "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary",
+      "Desc": "Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array.",
       "FriendlyName": "Categorical Transform",
       "ShortName": null,
       "Inputs": [
@@ -19779,7 +19779,7 @@
     },
     {
       "Name": "Transforms.PcaCalculator",
-      "Desc": "Train an PCA Anomaly model.",
+      "Desc": "PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace.",
       "FriendlyName": "Principal Component Analysis Transform",
       "ShortName": "Pca",
       "Inputs": [

From 98ffc798f7a9df3d944354b8e0b9547d29afa9c1 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 9 Jul 2018 14:40:23 -0700
Subject: [PATCH 04/14] Removing the XML docs from the docs folder and adding
 them under the respective projects.

---
 docs/code/xmlIncludes/Learners.xml            | 274 ------------------
 .../FastTreeArguments.cs                      |   2 +-
 .../FastTreeClassification.cs                 |   3 +-
 src/Microsoft.ML.FastTree/FastTreeRanking.cs  |   4 +-
 .../FastTreeRegression.cs                     |   4 +-
 src/Microsoft.ML.FastTree/FastTreeTweedie.cs  |   4 +-
 .../RandomForestClassification.cs             |   4 +-
 .../RandomForestRegression.cs                 |   4 +-
 src/Microsoft.ML.FastTree/XMLDoc.xml          |  78 +++++
 .../KMeansPlusPlusTrainer.cs                  |   4 +-
 src/Microsoft.ML.KMeansClustering/XMLDoc.xml  |  22 ++
 .../LightGbmBinaryTrainer.cs                  |   4 +-
 .../LightGbmMulticlassTrainer.cs              |   4 +-
 .../LightGbmRankingTrainer.cs                 |   4 +-
 .../LightGbmRegressionTrainer.cs              |   4 +-
 src/Microsoft.ML.LightGBM/XMLDoc.xml          |  16 +
 .../Microsoft.ML.PCA}/Pca.xml                 |   0
 src/Microsoft.ML.PCA/PcaTrainer.cs            |   4 +-
 src/Microsoft.ML.PCA/PcaTransform.cs          |   4 +-
 .../FactorizationMachineTrainer.cs            |   4 +-
 .../FactorizationMachine/XMLDoc.xml           |  42 +++
 .../LogisticRegression/LogisticRegression.cs  |   8 +-
 .../MulticlassLogisticRegression.cs           |   8 +-
 .../Standard/LogisticRegression/XMLDoc.xml    |  67 +++++
 .../Standard/Online/AveragedPerceptron.cs     |   4 +-
 .../Standard/Online/OnlineGradientDescent.cs  |   4 +-
 .../Standard/Online/XMLDoc.xml                |  44 +++
 .../PoissonRegression/PoissonRegression.cs    |   4 +-
 .../Standard/PoissonRegression/XMLDoc.xml     |  17 ++
 .../Standard/SdcaMultiClass.cs                |   4 +-
 .../Standard/SdcaRegression.cs                |   4 +-
 .../Standard/XMLDoc.xml                       |  30 ++
 .../CategoricalHashTransform.cs               |   2 +-
 .../CategoricalTransform.cs                   |   6 +-
 .../Microsoft.ML.Transforms/XMLDoc.xml        |   0
 src/Microsoft.ML/CSharpApi.cs                 |  50 ++--
 36 files changed, 392 insertions(+), 349 deletions(-)
 delete mode 100644 docs/code/xmlIncludes/Learners.xml
 create mode 100644 src/Microsoft.ML.FastTree/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.KMeansClustering/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.LightGBM/XMLDoc.xml
 rename {docs/code/xmlIncludes => src/Microsoft.ML.PCA}/Pca.xml (100%)
 create mode 100644 src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml
 create mode 100644 src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml
 rename docs/code/xmlIncludes/Transforms.xml => src/Microsoft.ML.Transforms/XMLDoc.xml (100%)

diff --git a/docs/code/xmlIncludes/Learners.xml b/docs/code/xmlIncludes/Learners.xml
deleted file mode 100644
index 3c3ddd6bf0..0000000000
--- a/docs/code/xmlIncludes/Learners.xml
+++ /dev/null
@@ -1,274 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
-  <members>
-    
-    <member name="FieldAwareFactorizationMachineBinaryClassifier">
-      <summary>
-        Train a field-aware factorization machine for binary classification using ADAGRAD (an advanced stochastic gradient method). 
-      </summary>
-      <remarks>
-        Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
-        The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
-        An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
-        <para> For a general idea of what Field-aware Factorization Machines are see: <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
-        </para>
-        <para>See references below for more details. 
-        This trainer is essentially faster the one introduced in [2] because of some implemtation tricks[3].
-        </para>
-          <list >
-            <item>
-              <description>
-                [1] <a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a></description></item>
-            <item>
-              <description>
-                [2] <a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
-              </description>
-            </item>
-            <item>
-              <description>
-                [3] <a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
-              </description>
-            </item>
-          </list>
-      </remarks>
-      <example>
-        <code>
-          pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier(){ LearningRate = 0.5f, Iter=2 });
-        </code>
-      </example>
-    </member>
-
-    <member name="SDCA">
-      <summary>
-        Train an SDCA linear model.
-      </summary>
-      <remarks>
-        This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
-        The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.
-        <para>
-          Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
-          Several choices of loss functions are also provided.
-          The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
-        </para>
-        <para>
-          Note that SDCA is a stochastic and streaming optimization algorithm.
-          The results depends on the order of the training data. For reproducible results, it is recommended that one sets 'Shuffle' to
-          False and 'NumThreads' to 1.
-          Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence.
-          In general, the larger the 'L2Const', the faster SDCA converges.
-        </para>
-        <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-        <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-      </remarks>
-    </member>
-
-    <member name="FastTree">
-      <summary>
-        Trains gradient boosted decision trees to the LambdaRank quasi-gradient. 
-      </summary>
-      <remarks>
-        <para>
-          FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
-          Gradient boosting is a machine learning technique for regression problems.
-          It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
-          So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
-        </para>
-        <para>
-          MART learns an ensemble of regression trees, which is a decision tree with scalar values in its leaves.
-          A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input.
-          At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x &lt;= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature.
-          The functions that can be produced by a regression tree are all the piece-wise constant functions.
-        </para>
-        <para>
-          The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
-          The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
-        </para>
-        <list type='bullet'>
-          <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-          <item>In case of a regression problem, the output is the predicted value of the function.</item>
-          <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
-        </list>
-        <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-        <a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-      </remarks>
-    </member>
-    
-    <member name="FastForest">
-      <summary>
-        Trains a random forest to fit target values using least-squares.
-      </summary>
-      <remarks>
-        Decision trees are non-parametric models that perform a sequence of simple tests on inputs.
-        This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed.
-        A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
-        <para>Decision trees have several advantages:</para>
-        <list type='bullet'>
-          <item><description>They are efficient in both computation and memory usage during training and prediction. </description></item>
-          <item><description>They can represent non-linear decision boundaries.</description></item>
-          <item><description>They perform integrated feature selection and classification. </description></item>
-          <item><description>They are resilient in the presence of noisy features.</description></item>
-        </list>
-        Fast forest is a random forest implementation.
-        The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
-        An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
-        This decision forest classifier consists of an ensemble of decision trees.
-        Generally, ensemble models provide better coverage and accuracy than single decision trees.
-        Each tree in a decision forest outputs a Gaussian distribution.
-        <a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
-        <a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
-        <a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
-      </remarks>
-    </member>
-
-    <member name="LightGBM">
-      <summary>
-        Trains a Light GBM Model.
-      </summary>
-      <remarks>
-        Light GBM is an open source implementation of boosted trees.
-        <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a>
-      </remarks>
-    </member>
-
-    <member name="LBFGS">
-      <summary>
-        Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. 
-        The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.
-      </summary>
-      <remarks>
-        If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
-        <para>
-          The optimization technique used for LogisticRegression Classifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
-          Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
-          But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction,
-          so that it is especially suited for problems with a large number of variables.
-          The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
-        </para>
-        <para>
-          This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
-          Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
-          This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff.
-          Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
-          An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
-        </para>
-          <list type='bullet'>
-            <item>
-              <description>
-              <paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data.
-              It pulls small weights associated features that are relatively unimportant towards 0.
-            </description>
-            </item>
-            <item>
-              <description>
-                <paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero.
-              </description>
-            </item>
-          </list>
-          Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
-          The default values of x and y are both 1.
-          An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
-        <para>For more information see:</para>
-        <list type='bullet'>
-          <item><description><a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.</description></item>
-          <item><description><a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.</description></item>
-          <item><description><a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.</description></item>
-          <item><description><a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.</description></item>
-        </list>
-      </remarks>
-    </member>
-    <example name='LogisticRegressionClassifier'>
-      <example>
-        <code>
-          pipeline.Add(new LogisticRegressionClassifier());
-        </code>
-      </example>
-    </example>
-    <example name='LogisticRegressionBinaryClassifier'>
-      <example>
-        <code>
-          pipeline.Add(new LogisticRegressionBinaryClassifier());
-        </code>
-      </example>
-    </example>
-
-    <member name="KMeans++">
-      <summary>
-        K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified 
-        number of clusters in order to minimize the within-cluster sum of squares.
-      </summary>
-      <remarks>
-        K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
-        YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).
-        YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
-        It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
-        <para>For more information on K-means, and K-means++ see:</para>
-        <para><a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.</para>
-        <para><a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a></para>
-      </remarks>
-    </member>
-
-    <member name="OGD">
-      <summary>
-        Stochastic gradient descent is an optimization method used to train a wide range of models in machine learning. 
-        In the ML.Net the implementation of OGD, it is for linear regression. 
-      </summary>
-      <remarks>
-        Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
-        The OnlineGradientDescentRegressor implements the standard (non-batch) SGD, with a choice of loss functions,
-        and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
-      </remarks>
-    </member>
-
-    <member name="AP">
-      <summary>
-        Averaged Perceptron Binary Classifier. 
-      </summary>
-      <remarks>
-        Perceptron is a classification algorithm that makes its predictions based on a linear function.
-        I.e., for an instance with feature values f0, f1,..., f_D-1, , the prediction is given by the sign of sigma[0,D-1] ( w_i * f_i), where w_0, w_1,...,w_D-1 are the weights computed by the algorithm.
-        <para>
-          Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
-          The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed.
-          If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
-          the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
-          multiplied by a factor 0 &lt; a &lt;= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate,
-          and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
-        </para>
-        <para>
-          In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored,
-          together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
-          The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) or the different weight vectors.
-        </para>
-        <para> For more information see:</para>
-        <para><a href='https://en.wikipedia.org/wiki/Perceptron'>Wikipedia entry for Perceptron</a></para>
-        <para><a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a></para>
-      </remarks>
-    </member>
-    
-    <member name="PoissonRegression">
-      <summary>
-        Trains a Poisson Regression model.  
-      </summary>
-      <remarks>
-        <a href='https://en.wikipedia.org/wiki/Poisson_regression'>Poisson regression</a> is a parameterized regression method.
-        It assumes that the log of the conditional mean of the dependent variable follows a linear function of the dependent variables.
-        Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations.
-      </remarks>
-    </member>
-
-    <member name="FastTreeTweedieRegression">
-      <summary>
-        Trains gradient boosted decision trees to fit target values using a Tweedie loss function. 
-        This learner is a generalization of Poisson, compound Poisson, and gamma regression.
-      </summary>
-      <remarks>
-        The Tweedie boosting model follows the mathematics established in <a href="https://arxiv.org/pdf/1508.06378.pdf">
-        Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models.</a> from Yang, Quan, and Zou. 
-        For an introduction to Gradient Boosting, and more information, see:
-        <para><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a></para>
-        <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
-      </remarks>
-    </member>
-        
-  </members>
-</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
index 9bd900a94e..aff3b10ca9 100644
--- a/src/Microsoft.ML.FastTree/FastTreeArguments.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -20,7 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
     {
     }
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer
     {
         [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
index df05261c3f..bbf7cdb82f 100644
--- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -100,6 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
+    /// <include file = './XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer :
         BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -345,7 +346,7 @@ public static partial class FastTree
             Desc = FastTreeBinaryClassificationTrainer.Summary,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
             ShortName = FastTreeBinaryClassificationTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
index c196f9c303..fe08d5f39f 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -38,7 +38,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
         IHasLabelGains
     {
@@ -1101,7 +1101,7 @@ public static partial class FastTree
             Desc = FastTreeRankingTrainer.Summary,
             UserName = FastTreeRankingTrainer.UserNameValue,
             ShortName = FastTreeRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
index a9f235f4d8..6266aa2fcd 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -31,7 +31,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
     {
         public const string LoadNameValue = "FastTreeRegression";
@@ -453,7 +453,7 @@ public static partial class FastTree
             Desc = FastTreeRegressionTrainer.Summary,
             UserName = FastTreeRegressionTrainer.UserNameValue,
             ShortName = FastTreeRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
index cf8c1fd8b2..8fd6596c65 100644
--- a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
@@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.FastTree
     // The Tweedie boosting model follows the mathematics established in:
     // Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
     // https://arxiv.org/pdf/1508.06378.pdf
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
     public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
     {
         public const string LoadNameValue = "FastTreeTweedieRegression";
@@ -461,7 +461,7 @@ public static partial class FastTree
             Desc = FastTreeTweedieTrainer.Summary,
             UserName = FastTreeTweedieTrainer.UserNameValue,
             ShortName = FastTreeTweedieTrainer.ShortName,
-            XmlInclude = new [] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
+            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
index 43ed8b0561..e1be1b5871 100644
--- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -106,7 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         }
     }
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestClassification :
         RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -213,7 +213,7 @@ public static partial class FastForest
             Desc = FastForestClassification.Summary,
             UserName = FastForestClassification.UserNameValue,
             ShortName = FastForestClassification.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
index 8483d99f01..be7e3f370f 100644
--- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -137,7 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
         }
     }
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
     {
         public sealed class Arguments : FastForestArgumentsBase
@@ -285,7 +285,7 @@ public static partial class FastForest
             Desc = FastForestRegression.Summary,
             UserName = FastForestRegression.LoadNameValue,
             ShortName = FastForestRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/XMLDoc.xml b/src/Microsoft.ML.FastTree/XMLDoc.xml
new file mode 100644
index 0000000000..f68cb4de65
--- /dev/null
+++ b/src/Microsoft.ML.FastTree/XMLDoc.xml
@@ -0,0 +1,78 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+
+    <member name="FastTree">
+      <summary>
+        Trains gradient boosted decision trees to the LambdaRank quasi-gradient. 
+      </summary>
+      <remarks>
+        <para>
+          FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
+          Gradient boosting is a machine learning technique for regression problems.
+          It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
+          So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
+        </para>
+        <para>
+          MART learns an ensemble of regression trees, each of which is a decision tree with scalar values in its leaves.
+          A decision (or regression) tree is a binary tree-like flow chart, where at each interior node one decides which of the two child nodes to continue to based on one of the feature values from the input.
+          At each leaf node, a value is returned. In the interior nodes, the decision is based on the test 'x &lt;= v' where x is the value of the feature in the input sample and v is one of the possible values of this feature.
+          The functions that can be produced by a regression tree are all the piece-wise constant functions.
+        </para>
+        <para>
+          The ensemble of trees is produced by computing, in each step, a regression tree that approximates the gradient of the loss function, and adding it to the previous tree with coefficients that minimize the loss of the new tree.
+          The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
+        </para>
+        <list type='bullet'>
+          <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
+          <item>In case of a regression problem, the output is the predicted value of the function.</item>
+          <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
+        </list>
+        <para><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a></para>
+        <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
+      </remarks>
+    </member>
+    
+    <member name="FastForest">
+      <summary>
+        Trains a random forest to fit target values using least-squares.
+      </summary>
+      <remarks>
+        Decision trees are non-parametric models that perform a sequence of simple tests on inputs.
+        This decision procedure maps them to outputs found in the training dataset whose inputs were similar to the instance being processed.
+        A decision is made at each node of the binary tree data structure based on a measure of similarity that maps each instance recursively through the branches of the tree until the appropriate leaf node is reached and the output decision returned.
+        <para>Decision trees have several advantages:</para>
+        <list type='bullet'>
+          <item><description>They are efficient in both computation and memory usage during training and prediction. </description></item>
+          <item><description>They can represent non-linear decision boundaries.</description></item>
+          <item><description>They perform integrated feature selection and classification. </description></item>
+          <item><description>They are resilient in the presence of noisy features.</description></item>
+        </list>
+        Fast forest is a random forest implementation.
+        The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
+        An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
+        This decision forest classifier consists of an ensemble of decision trees.
+        Generally, ensemble models provide better coverage and accuracy than single decision trees.
+        Each tree in a decision forest outputs a Gaussian distribution.
+        <para><a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a></para>
+        <para><a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a></para>
+        <para><a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a></para>
+      </remarks>
+    </member>
+
+    <member name="FastTreeTweedieRegression">
+      <summary>
+        Trains gradient boosted decision trees to fit target values using a Tweedie loss function. 
+        This learner is a generalization of Poisson, compound Poisson, and gamma regression.
+      </summary>
+      <remarks>
+        The Tweedie boosting model follows the mathematics established in <a href="https://arxiv.org/pdf/1508.06378.pdf">
+        Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models.</a> from Yang, Quan, and Zou. 
+        For an introduction to Gradient Boosting, and more information, see:
+        <para><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a></para>
+        <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
+      </remarks>
+    </member>
+        
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
index 669a521e72..74a6a1874b 100644
--- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
+++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
@@ -28,7 +28,7 @@
 
 namespace Microsoft.ML.Runtime.KMeans
 {
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="KMeans++"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="KMeans++"]/*' />
     public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor>
     {
         public const string LoadNameValue = "KMeansPlusPlus";
@@ -213,7 +213,7 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/XMLDoc.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
         public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.KMeansClustering/XMLDoc.xml b/src/Microsoft.ML.KMeansClustering/XMLDoc.xml
new file mode 100644
index 0000000000..affaeabf98
--- /dev/null
+++ b/src/Microsoft.ML.KMeansClustering/XMLDoc.xml
@@ -0,0 +1,22 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+
+    <member name="KMeans++">
+      <summary>
+        K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified 
+        number of clusters in order to minimize the within-cluster sum of squares.
+      </summary>
+      <remarks>
+        K-means++ improves upon K-means by using the <a href='http://research.microsoft.com/apps/pubs/default.aspx?id=252149'>Yinyang K-Means</a> method for choosing the initial cluster centers.
+        YYK-Means accelerates K-Means up to an order of magnitude while producing exactly the same clustering results (modulo floating point precision issues).
+        YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
+        It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
+        <para>For more information on K-means, and K-means++ see:</para>
+        <para><a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.</para>
+        <para><a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a></para>
+      </remarks>
+    </member>
+   
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 45429d6c56..fc6c345e30 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -81,7 +81,7 @@ public static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float, IPredictorWithFeatureWeights<float>>
     {
         internal const string UserName = "LightGBM Binary Classifier";
@@ -134,7 +134,7 @@ public static partial class LightGbm
             Desc = LightGbmBinaryTrainer.Summary,
             UserName = LightGbmBinaryTrainer.UserName, 
             ShortName = LightGbmBinaryTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 5bc2838aea..6a3ea8183f 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -18,7 +18,7 @@
 namespace Microsoft.ML.Runtime.LightGBM
 {
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmMulticlassTrainer : LightGbmTrainerBase<VBuffer<float>, OvaPredictor>
     {
         public const string Summary = "LightGBM Multi Class Classifier";
@@ -185,7 +185,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM multi class model.", 
             UserName = LightGbmMulticlassTrainer.Summary, 
             ShortName = LightGbmMulticlassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index 1a61ab4907..82d33fa092 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -71,7 +71,7 @@ public static LightGbmRankingPredictor Create(IHostEnvironment env, ModelLoadCon
         public override PredictionKind PredictionKind { get { return PredictionKind.Ranking; } }
     }
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, LightGbmRankingPredictor>
     {
         public const string UserName = "LightGBM Ranking";
@@ -132,7 +132,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM ranking model.", 
             UserName = LightGbmRankingTrainer.UserName, 
             ShortName = LightGbmRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 4b2d0f2a3b..08c442cb47 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -20,7 +20,7 @@
 
 namespace Microsoft.ML.Runtime.LightGBM
 {
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRegressionPredictor : FastTreePredictionWrapper
     {
         public const string LoaderSignature = "LightGBMRegressionExec";
@@ -124,7 +124,7 @@ public static partial class LightGbm
             Desc = LightGbmRegressorTrainer.Summary, 
             UserName = LightGbmRegressorTrainer.UserNameValue, 
             ShortName = LightGbmRegressorTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/XMLDoc.xml b/src/Microsoft.ML.LightGBM/XMLDoc.xml
new file mode 100644
index 0000000000..4d53265ae3
--- /dev/null
+++ b/src/Microsoft.ML.LightGBM/XMLDoc.xml
@@ -0,0 +1,16 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="LightGBM">
+      <summary>
+        Trains a Light GBM Model.
+      </summary>
+      <remarks>
+        Light GBM is an open source implementation of boosted trees.
+        <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a>
+      </remarks>
+    </member>
+
+  </members>
+</docs>
\ No newline at end of file
diff --git a/docs/code/xmlIncludes/Pca.xml b/src/Microsoft.ML.PCA/Pca.xml
similarity index 100%
rename from docs/code/xmlIncludes/Pca.xml
rename to src/Microsoft.ML.PCA/Pca.xml
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index ff8e74f122..be9314887f 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -288,7 +288,7 @@ private static void PostProcess(VBuffer<Float>[] y, Float[] sigma, Float[] z, in
             Desc = "Train an PCA Anomaly model.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -308,7 +308,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
     // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
     // REVIEW: Include the above detail in the XML documentation file. 
-    /// <include file='../../docs/code/xmlIncludes/PCA.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='./Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,
diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index 9dd48c600e..34e3aedb9d 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='./Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -541,7 +541,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
             Desc = Summary,
             UserName = UserName, 
             ShortName = ShortName, 
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input);
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
index dacbe1a33f..0635362dc8 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.FactorizationMachine
      [2] http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
      [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
     */
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
     public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<RoleMappedData, FieldAwareFactorizationMachinePredictor>,
         IIncrementalTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>, IValidatingTrainer<RoleMappedData>,
         IIncrementalValidatingTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>
@@ -407,7 +407,7 @@ public override FieldAwareFactorizationMachinePredictor CreatePredictor()
             Desc = Summary,
             UserName = UserName,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml
new file mode 100644
index 0000000000..2e72b2ea9f
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml
@@ -0,0 +1,42 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="FieldAwareFactorizationMachineBinaryClassifier">
+      <summary>
+        Train a field-aware factorization machine for binary classification using ADAGRAD (an advanced stochastic gradient method). 
+      </summary>
+      <remarks>
+        Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
+        The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
+        An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
+        <para> For a general idea of what Field-aware Factorization Machines are see: <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
+        </para>
+        <para>See references below for more details. 
+        This trainer is essentially faster than the one introduced in [1] because of some implementation tricks [3].
+        </para>
+          <list >
+            <item>
+              <description>
+                [1] <a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a></description></item>
+            <item>
+              <description>
+                [2] <a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
+              </description>
+            </item>
+            <item>
+              <description>
+                [3] <a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
+              </description>
+            </item>
+          </list>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier(){ LearningRate = 0.5f, Iter=2 });
+        </code>
+      </example>
+    </member>
+        
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index 504bdca5f6..a92ffc081a 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -30,8 +30,8 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using Mkl = Microsoft.ML.Runtime.Learners.OlsLinearRegressionTrainer.Mkl;
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
     public sealed partial class LogisticRegression : LbfgsTrainerBase<Float, ParameterMixingCalibratedPredictor>
     {
         public const string LoadNameValue = "LogisticRegression";
@@ -392,8 +392,8 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
                             
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index 27e86682bd..f608821708 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -36,8 +36,8 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file = '../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file = '../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+    /// <include file = './XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file = './XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
     public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<VBuffer<Float>, MulticlassLogisticRegressionPredictor>
     {
         public const string LoadNameValue = "MultiClassLogisticRegression";
@@ -966,8 +966,8 @@ public partial class LogisticRegression
             Desc = Summary,
             UserName = MulticlassLogisticRegression.UserNameValue,
             ShortName = MulticlassLogisticRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml
new file mode 100644
index 0000000000..5ac68e2fc0
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml
@@ -0,0 +1,67 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+
+    <member name="LBFGS">
+      <summary>
+        Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. 
+        The algorithm predicts the probability of occurrence of an event by fitting data to a logistic function.
+      </summary>
+      <remarks>
+        If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
+        <para>
+          The optimization technique used for LogisticRegression Classifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
+          Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
+          But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction,
+          so that it is especially suited for problems with a large number of variables.
+          The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
+        </para>
+        <para>
+          This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
+          Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
+          This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff.
+          Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
+          An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
+        </para>
+          <list type='bullet'>
+            <item>
+              <description>
+              <paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data.
+              It pulls small weights associated with features that are relatively unimportant towards 0.
+            </description>
+            </item>
+            <item>
+              <description>
+                <paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero.
+              </description>
+            </item>
+          </list>
+          Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
+          The default values of x and y are both 1.
+          An aggressive regularization can harm predictive capacity by excluding important variables from the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.
+        <para>For more information see:</para>
+        <list type='bullet'>
+          <item><description><a href='http://research.microsoft.com/apps/pubs/default.aspx?id=78900'>Scalable Training of L1-Regularized Log-Linear Models</a>.</description></item>
+          <item><description><a href='https://msdn.microsoft.com/en-us/magazine/dn904675.aspx'>Test Run - L1 and L2 Regularization for Machine Learning</a>.</description></item>
+          <item><description><a href='http://en.wikipedia.org/wiki/L-BFGS'>Wikipedia: L-BFGS</a>.</description></item>
+          <item><description><a href='http://en.wikipedia.org/wiki/Logistic_regression'>Wikipedia: Logistic regression</a>.</description></item>
+        </list>
+      </remarks>
+    </member>
+    <example name='LogisticRegressionClassifier'>
+      <example>
+        <code>
+          pipeline.Add(new LogisticRegressionClassifier());
+        </code>
+      </example>
+    </example>
+    <example name='LogisticRegressionBinaryClassifier'>
+      <example>
+        <code>
+          pipeline.Add(new LogisticRegressionBinaryClassifier());
+        </code>
+      </example>
+    </example>
+   
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index 50ddc7fadb..d4910c142c 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -28,7 +28,7 @@ namespace Microsoft.ML.Runtime.Learners
     //     - Loss function. By default, hinge loss (aka max-margin avgd perceptron)
     //     - Feature normalization. By default, rescaling between min and max values for every feature
     //     - Prediction calibration to produce probabilities. Off by default, if on, uses exponential (aka Platt) calibration.
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="AP"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="AP"]/*' />
     public sealed class AveragedPerceptronTrainer :
         AveragedLinearTrainer<AveragedPerceptronTrainer.Arguments, LinearBinaryPredictor>
     {
@@ -94,7 +94,7 @@ public override LinearBinaryPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""AP""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name=""AP""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
index c67f10e2c0..076f247e1a 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
@@ -27,7 +27,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TPredictor = LinearRegressionPredictor;
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="OGD"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="OGD"]/*' />
     public sealed class OnlineGradientDescentTrainer : AveragedLinearTrainer<OnlineGradientDescentTrainer.Arguments, TPredictor>
     {
         internal const string LoadNameValue = "OnlineGradientDescent";
@@ -94,7 +94,7 @@ public override TPredictor CreatePredictor()
             Desc = "Train a Online gradient descent perceptron.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml
new file mode 100644
index 0000000000..1ab7647c4f
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml
@@ -0,0 +1,44 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="OGD">
+      <summary>
+        Stochastic gradient descent is an optimization method used to train a wide range of models in machine learning. 
+        In ML.NET, the implementation of OGD is for linear regression.
+      </summary>
+      <remarks>
+        Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
+        The OnlineGradientDescentRegressor implements the standard (non-batch) SGD, with a choice of loss functions,
+        and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
+      </remarks>
+    </member>
+
+    <member name="AP">
+      <summary>
+        Averaged Perceptron Binary Classifier. 
+      </summary>
+      <remarks>
+        Perceptron is a classification algorithm that makes its predictions based on a linear function.
+        I.e., for an instance with feature values f_0, f_1,..., f_D-1, the prediction is given by the sign of sigma[0, D-1] (w_i * f_i), where w_0, w_1,..., w_D-1 are the weights computed by the algorithm.
+        <para>
+          Perceptron is an online algorithm, i.e., it processes the instances in the training set one at a time.
+          The weights are initialized to be 0, or some random values. Then, for each example in the training set, the value of sigma[0, D-1] (w_i * f_i) is computed.
+          If this value has the same sign as the label of the current example, the weights remain the same. If they have opposite signs,
+          the weights vector is updated by either subtracting or adding (if the label is negative or positive, respectively) the feature vector of the current example,
+          multiplied by a factor 0 &lt; a &lt;= 1, called the learning rate. In a generalization of this algorithm, the weights are updated by adding the feature vector multiplied by the learning rate,
+          and by the gradient of some loss function (in the specific case described above, the loss is hinge-loss, whose gradient is 1 when it is non-zero).
+        </para>
+        <para>
+          In Averaged Perceptron (AKA voted-perceptron), the weight vectors are stored,
+          together with a weight that counts the number of iterations it survived (this is equivalent to storing the weight vector after every iteration, regardless of whether it was updated or not).
+          The prediction is then calculated by taking the weighted average of all the sums sigma[0, D-1] (w_i * f_i) of the different weight vectors.
+        </para>
+        <para> For more information see:</para>
+        <para><a href='https://en.wikipedia.org/wiki/Perceptron'>Wikipedia entry for Perceptron</a></para>
+        <para><a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a></para>
+      </remarks>
+    </member>
+
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
index 4d407fbfa7..970a1e62b2 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
     public sealed class PoissonRegression : LbfgsTrainerBase<Float, PoissonRegressionPredictor>
     {
         internal const string LoadNameValue = "PoissonRegression";
@@ -129,7 +129,7 @@ protected override void ProcessPriorDistribution(Float label, Float weight)
             Desc = "Train an Poisson regression model.", 
             UserName = UserNameValue, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml
new file mode 100644
index 0000000000..4d2aeec579
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml
@@ -0,0 +1,17 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+   
+    <member name="PoissonRegression">
+      <summary>
+        Trains a Poisson Regression model.  
+      </summary>
+      <remarks>
+        <a href='https://en.wikipedia.org/wiki/Poisson_regression'>Poisson regression</a> is a parameterized regression method.
+        It assumes that the log of the conditional mean of the dependent variable follows a linear function of the independent variables.
+        Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations.
+      </remarks>
+    </member>
+   
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
index 08a13ac7a1..6eb76ea2be 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.Learners
     using TVectorPredictor = IPredictorProducing<VBuffer<Float>>;
 
     // SDCA linear multiclass trainer.
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
     public class SdcaMultiClassTrainer : SdcaTrainerBase<TVectorPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAMC";
@@ -389,7 +389,7 @@ public static partial class Sdca
             Desc = SdcaMultiClassTrainer.Summary,
             UserName = SdcaMultiClassTrainer.UserNameValue,
             ShortName = SdcaMultiClassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
index cdd4ce6411..cfd1fb3682 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
@@ -25,7 +25,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TScalarPredictor = IPredictorWithFeatureWeights<Float>;
 
-    /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
     public sealed class SdcaRegressionTrainer : SdcaTrainerBase<IPredictor>, ITrainer<RoleMappedData, TScalarPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAR";
@@ -136,7 +136,7 @@ public static partial class Sdca
             Desc = SdcaRegressionTrainer.Summary,
             UserName = SdcaRegressionTrainer.UserNameValue,
             ShortName = SdcaRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml
new file mode 100644
index 0000000000..0b4336a96e
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml
@@ -0,0 +1,30 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+    
+    <member name="SDCA">
+      <summary>
+        Train an SDCA linear model.
+      </summary>
+      <remarks>
+        This classifier is a trainer based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions.
+        The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.
+        <para>
+          Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
+          Several choices of loss functions are also provided.
+          The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.
+        </para>
+        <para>
+          Note that SDCA is a stochastic and streaming optimization algorithm.
+          The results depend on the order of the training data. For reproducible results, it is recommended that one sets 'Shuffle' to
+          False and 'NumThreads' to 1.
+          Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence.
+          In general, the larger the 'L2Const', the faster SDCA converges.
+        </para>
+        <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
+        <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
+      </remarks>
+    </member>
+ 
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
index dace70510a..93f1729a6b 100644
--- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
@@ -19,7 +19,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
     public static class CategoricalHashTransform
     {
         public const int NumBitsLim = 31; // can't convert 31-bit hashes to indicator vectors, so max is 30
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index d2f7b5a154..553dc2b2f8 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -21,7 +21,7 @@
 [assembly: LoadableClass(typeof(void), typeof(Categorical), null, typeof(SignatureEntryPointModule), "Categorical")]
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
     public static class CategoricalTransform
     {
         public enum OutputKind : byte
@@ -246,7 +246,7 @@ public static class Categorical
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
             Desc = CategoricalTransform.Summary,
             UserName = CategoricalTransform.UserName, 
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -261,7 +261,7 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
             Desc = CategoricalHashTransform.Summary,
             UserName = CategoricalHashTransform.UserName ,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/docs/code/xmlIncludes/Transforms.xml b/src/Microsoft.ML.Transforms/XMLDoc.xml
similarity index 100%
rename from docs/code/xmlIncludes/Transforms.xml
rename to src/Microsoft.ML.Transforms/XMLDoc.xml
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index d82ac58d4f..7b995dc511 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -4095,7 +4095,7 @@ public sealed class Output
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="AP"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4599,7 +4599,7 @@ public enum Bundle : byte
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4890,7 +4890,7 @@ public FastForestBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5177,7 +5177,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5566,7 +5566,7 @@ public FastTreeBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5990,7 +5990,7 @@ public FastTreeRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6374,7 +6374,7 @@ public FastTreeRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
         public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6763,7 +6763,7 @@ public FastTreeTweedieRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7190,7 +7190,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="KMeans++"]/*' />
+        /// <include file='../Microsoft.ML.KMeansClustering/XMLDoc.xml' path='docs/members/member[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7306,7 +7306,7 @@ public enum LightGbmArgumentsEvalMetricType
         }
 
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7509,7 +7509,7 @@ public LightGbmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7712,7 +7712,7 @@ public LightGbmClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7915,7 +7915,7 @@ public LightGbmRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8253,8 +8253,8 @@ public LinearSvmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8402,8 +8402,8 @@ public LogisticRegressionBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8624,7 +8624,7 @@ public NaiveBayesClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="OGD"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name="OGD"]/*' />
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8777,7 +8777,7 @@ public OnlineGradientDescentRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8871,7 +8871,7 @@ public PcaAnomalyDetectorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9153,7 +9153,7 @@ public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9276,7 +9276,7 @@ public StochasticDualCoordinateAscentClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9879,7 +9879,7 @@ public sealed partial class CategoricalHashTransformColumn : OneToOneColumn<Cate
 
         }
 
-        /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
         public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10052,7 +10052,7 @@ public sealed partial class CategoricalTransformColumn : OneToOneColumn<Categori
 
         }
 
-        /// <include file='../../docs/code/xmlIncludes/Transforms.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
         public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13983,7 +13983,7 @@ public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColu
 
         }
 
-        /// <include file='../../docs/code/xmlIncludes/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 

From 6854c9a98d83453d4afbd33eaf71e26ecdb5a0b0 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 9 Jul 2018 15:51:24 -0700
Subject: [PATCH 05/14] Some OSes are picky about file-name casing.

---
 src/Microsoft.ML.PCA/PcaTrainer.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index be9314887f..7e15d9baf7 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -308,7 +308,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
     // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
     // REVIEW: Include the above detail in the XML documentation file. 
-    /// <include file='./PCA.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='./Pca.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,

From 5f52db81c24e367583687b08f04e70725f4a1d65 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Tue, 10 Jul 2018 13:51:46 -0700
Subject: [PATCH 06/14] Rename the XML doc include files to plain doc.xml

---
 .../FastTreeArguments.cs                      |  2 +-
 .../FastTreeClassification.cs                 |  4 +-
 src/Microsoft.ML.FastTree/FastTreeRanking.cs  |  4 +-
 .../FastTreeRegression.cs                     |  4 +-
 src/Microsoft.ML.FastTree/FastTreeTweedie.cs  |  4 +-
 .../RandomForestClassification.cs             |  4 +-
 .../RandomForestRegression.cs                 |  4 +-
 .../{XMLDoc.xml => doc.xml}                   |  2 +-
 .../KMeansPlusPlusTrainer.cs                  |  4 +-
 .../{XMLDoc.xml => doc.xml}                   |  0
 .../LightGbmBinaryTrainer.cs                  |  4 +-
 .../LightGbmMulticlassTrainer.cs              |  4 +-
 .../LightGbmRankingTrainer.cs                 |  4 +-
 .../LightGbmRegressionTrainer.cs              |  4 +-
 .../{XMLDoc.xml => doc.xml}                   |  0
 src/Microsoft.ML.PCA/PcaTrainer.cs            |  4 +-
 src/Microsoft.ML.PCA/PcaTransform.cs          |  4 +-
 src/Microsoft.ML.PCA/{Pca.xml => doc.xml}     |  0
 .../FactorizationMachineTrainer.cs            |  4 +-
 .../{XMLDoc.xml => doc.xml}                   |  0
 .../LogisticRegression/LogisticRegression.cs  |  8 +--
 .../MulticlassLogisticRegression.cs           |  8 +--
 .../{XMLDoc.xml => doc.xml}                   |  0
 .../Standard/Online/AveragedPerceptron.cs     |  4 +-
 .../Standard/Online/OnlineGradientDescent.cs  |  4 +-
 .../Standard/Online/{XMLDoc.xml => doc.xml}   |  0
 .../PoissonRegression/PoissonRegression.cs    |  4 +-
 .../PoissonRegression/{XMLDoc.xml => doc.xml} |  0
 .../Standard/SdcaMultiClass.cs                |  4 +-
 .../Standard/SdcaRegression.cs                |  4 +-
 .../Standard/{XMLDoc.xml => doc.xml}          |  0
 .../CategoricalHashTransform.cs               |  2 +-
 .../CategoricalTransform.cs                   |  6 +--
 .../{XMLDoc.xml => doc.xml}                   |  0
 src/Microsoft.ML/CSharpApi.cs                 | 50 +++++++++----------
 35 files changed, 77 insertions(+), 77 deletions(-)
 rename src/Microsoft.ML.FastTree/{XMLDoc.xml => doc.xml} (97%)
 rename src/Microsoft.ML.KMeansClustering/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.LightGBM/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.PCA/{Pca.xml => doc.xml} (100%)
 rename src/Microsoft.ML.StandardLearners/FactorizationMachine/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.StandardLearners/Standard/Online/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.StandardLearners/Standard/{XMLDoc.xml => doc.xml} (100%)
 rename src/Microsoft.ML.Transforms/{XMLDoc.xml => doc.xml} (100%)

diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
index aff3b10ca9..e6274e3155 100644
--- a/src/Microsoft.ML.FastTree/FastTreeArguments.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -20,7 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
     {
     }
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer
     {
         [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
index bbf7cdb82f..18f61e6dbe 100644
--- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -100,7 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file = './XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file = './doc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer :
         BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -346,7 +346,7 @@ public static partial class FastTree
             Desc = FastTreeBinaryClassificationTrainer.Summary,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
             ShortName = FastTreeBinaryClassificationTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
index fe08d5f39f..6eabca8c78 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -38,7 +38,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
         IHasLabelGains
     {
@@ -1101,7 +1101,7 @@ public static partial class FastTree
             Desc = FastTreeRankingTrainer.Summary,
             UserName = FastTreeRankingTrainer.UserNameValue,
             ShortName = FastTreeRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
index 6266aa2fcd..308437440a 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -31,7 +31,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
     {
         public const string LoadNameValue = "FastTreeRegression";
@@ -453,7 +453,7 @@ public static partial class FastTree
             Desc = FastTreeRegressionTrainer.Summary,
             UserName = FastTreeRegressionTrainer.UserNameValue,
             ShortName = FastTreeRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
index 8fd6596c65..b43c499a44 100644
--- a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
@@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.FastTree
     // The Tweedie boosting model follows the mathematics established in:
     // Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
     // https://arxiv.org/pdf/1508.06378.pdf
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
     public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
     {
         public const string LoadNameValue = "FastTreeTweedieRegression";
@@ -461,7 +461,7 @@ public static partial class FastTree
             Desc = FastTreeTweedieTrainer.Summary,
             UserName = FastTreeTweedieTrainer.UserNameValue,
             ShortName = FastTreeTweedieTrainer.ShortName,
-            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
+            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
index e1be1b5871..8cd62ceb77 100644
--- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -106,7 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         }
     }
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestClassification :
         RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -213,7 +213,7 @@ public static partial class FastForest
             Desc = FastForestClassification.Summary,
             UserName = FastForestClassification.UserNameValue,
             ShortName = FastForestClassification.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
index be7e3f370f..f501037df3 100644
--- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -137,7 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
         }
     }
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
     {
         public sealed class Arguments : FastForestArgumentsBase
@@ -285,7 +285,7 @@ public static partial class FastForest
             Desc = FastForestRegression.Summary,
             UserName = FastForestRegression.LoadNameValue,
             ShortName = FastForestRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/XMLDoc.xml b/src/Microsoft.ML.FastTree/doc.xml
similarity index 97%
rename from src/Microsoft.ML.FastTree/XMLDoc.xml
rename to src/Microsoft.ML.FastTree/doc.xml
index f68cb4de65..36f0b41f24 100644
--- a/src/Microsoft.ML.FastTree/XMLDoc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -8,7 +8,7 @@
       </summary>
       <remarks>
         <para>
-          FastTrees is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
+          FastTree is an efficient implementation of the <a href='https://arxiv.org/abs/1505.01866'>MART</a> gradient boosting algorithm.
           Gradient boosting is a machine learning technique for regression problems.
           It builds each regression tree in a step-wise fashion, using a predefined loss function to measure the error for each step and corrects for it in the next.
           So this prediction model is actually an ensemble of weaker prediction models. In regression problems, boosting builds a series of of such trees in a step-wise fashion and then selects the optimal tree using an arbitrary differentiable loss function.
diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
index 74a6a1874b..dce7be48d2 100644
--- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
+++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
@@ -28,7 +28,7 @@
 
 namespace Microsoft.ML.Runtime.KMeans
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="KMeans++"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="KMeans++"]/*' />
     public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor>
     {
         public const string LoadNameValue = "KMeansPlusPlus";
@@ -213,7 +213,7 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/XMLDoc.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
         public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.KMeansClustering/XMLDoc.xml b/src/Microsoft.ML.KMeansClustering/doc.xml
similarity index 100%
rename from src/Microsoft.ML.KMeansClustering/XMLDoc.xml
rename to src/Microsoft.ML.KMeansClustering/doc.xml
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index fc6c345e30..54cd523e72 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -81,7 +81,7 @@ public static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float, IPredictorWithFeatureWeights<float>>
     {
         internal const string UserName = "LightGBM Binary Classifier";
@@ -134,7 +134,7 @@ public static partial class LightGbm
             Desc = LightGbmBinaryTrainer.Summary,
             UserName = LightGbmBinaryTrainer.UserName, 
             ShortName = LightGbmBinaryTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 6a3ea8183f..2a84bad0e8 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -18,7 +18,7 @@
 namespace Microsoft.ML.Runtime.LightGBM
 {
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmMulticlassTrainer : LightGbmTrainerBase<VBuffer<float>, OvaPredictor>
     {
         public const string Summary = "LightGBM Multi Class Classifier";
@@ -185,7 +185,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM multi class model.", 
             UserName = LightGbmMulticlassTrainer.Summary, 
             ShortName = LightGbmMulticlassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index 82d33fa092..4a1d1634a8 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -71,7 +71,7 @@ public static LightGbmRankingPredictor Create(IHostEnvironment env, ModelLoadCon
         public override PredictionKind PredictionKind { get { return PredictionKind.Ranking; } }
     }
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, LightGbmRankingPredictor>
     {
         public const string UserName = "LightGBM Ranking";
@@ -132,7 +132,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM ranking model.", 
             UserName = LightGbmRankingTrainer.UserName, 
             ShortName = LightGbmRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 08c442cb47..6ae3da792a 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -20,7 +20,7 @@
 
 namespace Microsoft.ML.Runtime.LightGBM
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRegressionPredictor : FastTreePredictionWrapper
     {
         public const string LoaderSignature = "LightGBMRegressionExec";
@@ -124,7 +124,7 @@ public static partial class LightGbm
             Desc = LightGbmRegressorTrainer.Summary, 
             UserName = LightGbmRegressorTrainer.UserNameValue, 
             ShortName = LightGbmRegressorTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/XMLDoc.xml b/src/Microsoft.ML.LightGBM/doc.xml
similarity index 100%
rename from src/Microsoft.ML.LightGBM/XMLDoc.xml
rename to src/Microsoft.ML.LightGBM/doc.xml
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 7e15d9baf7..23e7351a86 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -288,7 +288,7 @@ private static void PostProcess(VBuffer<Float>[] y, Float[] sigma, Float[] z, in
             Desc = "Train an PCA Anomaly model.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -308,7 +308,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
     // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
     // REVIEW: Include the above detail in the XML documentation file. 
-    /// <include file='./Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,
diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index 34e3aedb9d..0807abc5ed 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="PCA"]/*' />
     public sealed class PcaTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -541,7 +541,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
             Desc = Summary,
             UserName = UserName, 
             ShortName = ShortName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input);
diff --git a/src/Microsoft.ML.PCA/Pca.xml b/src/Microsoft.ML.PCA/doc.xml
similarity index 100%
rename from src/Microsoft.ML.PCA/Pca.xml
rename to src/Microsoft.ML.PCA/doc.xml
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
index 0635362dc8..b967bd9f95 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.FactorizationMachine
      [2] http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
      [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
     */
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
     public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<RoleMappedData, FieldAwareFactorizationMachinePredictor>,
         IIncrementalTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>, IValidatingTrainer<RoleMappedData>,
         IIncrementalValidatingTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>
@@ -407,7 +407,7 @@ public override FieldAwareFactorizationMachinePredictor CreatePredictor()
             Desc = Summary,
             UserName = UserName,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
similarity index 100%
rename from src/Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml
rename to src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index a92ffc081a..3cf97ea801 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -30,8 +30,8 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using Mkl = Microsoft.ML.Runtime.Learners.OlsLinearRegressionTrainer.Mkl;
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file='./XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file='./doc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
     public sealed partial class LogisticRegression : LbfgsTrainerBase<Float, ParameterMixingCalibratedPredictor>
     {
         public const string LoadNameValue = "LogisticRegression";
@@ -392,8 +392,8 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
                             
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index f608821708..66c1d41084 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -36,8 +36,8 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file = './XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file = './XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+    /// <include file = './doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+    /// <include file = './doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
     public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<VBuffer<Float>, MulticlassLogisticRegressionPredictor>
     {
         public const string LoadNameValue = "MultiClassLogisticRegression";
@@ -966,8 +966,8 @@ public partial class LogisticRegression
             Desc = Summary,
             UserName = MulticlassLogisticRegression.UserNameValue,
             ShortName = MulticlassLogisticRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
similarity index 100%
rename from src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml
rename to src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index d4910c142c..aa5ecb67a5 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -28,7 +28,7 @@ namespace Microsoft.ML.Runtime.Learners
     //     - Loss function. By default, hinge loss (aka max-margin avgd perceptron)
     //     - Feature normalization. By default, rescaling between min and max values for every feature
     //     - Prediction calibration to produce probabilities. Off by default, if on, uses exponential (aka Platt) calibration.
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="AP"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="AP"]/*' />
     public sealed class AveragedPerceptronTrainer :
         AveragedLinearTrainer<AveragedPerceptronTrainer.Arguments, LinearBinaryPredictor>
     {
@@ -94,7 +94,7 @@ public override LinearBinaryPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name=""AP""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name=""AP""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
index 076f247e1a..f345466e19 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
@@ -27,7 +27,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TPredictor = LinearRegressionPredictor;
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="OGD"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="OGD"]/*' />
     public sealed class OnlineGradientDescentTrainer : AveragedLinearTrainer<OnlineGradientDescentTrainer.Arguments, TPredictor>
     {
         internal const string LoadNameValue = "OnlineGradientDescent";
@@ -94,7 +94,7 @@ public override TPredictor CreatePredictor()
             Desc = "Train a Online gradient descent perceptron.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
similarity index 100%
rename from src/Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml
rename to src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
index 970a1e62b2..9322c2cc75 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
     public sealed class PoissonRegression : LbfgsTrainerBase<Float, PoissonRegressionPredictor>
     {
         internal const string LoadNameValue = "PoissonRegression";
@@ -129,7 +129,7 @@ protected override void ProcessPriorDistribution(Float label, Float weight)
             Desc = "Train an Poisson regression model.", 
             UserName = UserNameValue, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
similarity index 100%
rename from src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml
rename to src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
index 6eb76ea2be..20bc349a7c 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.Learners
     using TVectorPredictor = IPredictorProducing<VBuffer<Float>>;
 
     // SDCA linear multiclass trainer.
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="SDCA"]/*' />
     public class SdcaMultiClassTrainer : SdcaTrainerBase<TVectorPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAMC";
@@ -389,7 +389,7 @@ public static partial class Sdca
             Desc = SdcaMultiClassTrainer.Summary,
             UserName = SdcaMultiClassTrainer.UserNameValue,
             ShortName = SdcaMultiClassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
index cfd1fb3682..512818bba7 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
@@ -25,7 +25,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TScalarPredictor = IPredictorWithFeatureWeights<Float>;
 
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="SDCA"]/*' />
     public sealed class SdcaRegressionTrainer : SdcaTrainerBase<IPredictor>, ITrainer<RoleMappedData, TScalarPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAR";
@@ -136,7 +136,7 @@ public static partial class Sdca
             Desc = SdcaRegressionTrainer.Summary,
             UserName = SdcaRegressionTrainer.UserNameValue,
             ShortName = SdcaRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
similarity index 100%
rename from src/Microsoft.ML.StandardLearners/Standard/XMLDoc.xml
rename to src/Microsoft.ML.StandardLearners/Standard/doc.xml
diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
index 93f1729a6b..f400d92a93 100644
--- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
@@ -19,7 +19,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
     public static class CategoricalHashTransform
     {
         public const int NumBitsLim = 31; // can't convert 31-bit hashes to indicator vectors, so max is 30
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index 553dc2b2f8..fc1382901b 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -21,7 +21,7 @@
 [assembly: LoadableClass(typeof(void), typeof(Categorical), null, typeof(SignatureEntryPointModule), "Categorical")]
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./XMLDoc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+    /// <include file='./doc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
     public static class CategoricalTransform
     {
         public enum OutputKind : byte
@@ -246,7 +246,7 @@ public static class Categorical
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
             Desc = CategoricalTransform.Summary,
             UserName = CategoricalTransform.UserName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -261,7 +261,7 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
             Desc = CategoricalHashTransform.Summary,
             UserName = CategoricalHashTransform.UserName ,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/XMLDoc.xml b/src/Microsoft.ML.Transforms/doc.xml
similarity index 100%
rename from src/Microsoft.ML.Transforms/XMLDoc.xml
rename to src/Microsoft.ML.Transforms/doc.xml
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index 7b995dc511..b190750ca1 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -4095,7 +4095,7 @@ public sealed class Output
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name="AP"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4599,7 +4599,7 @@ public enum Bundle : byte
         }
 
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4890,7 +4890,7 @@ public FastForestBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5177,7 +5177,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
         }
 
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5566,7 +5566,7 @@ public FastTreeBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5990,7 +5990,7 @@ public FastTreeRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6374,7 +6374,7 @@ public FastTreeRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/XMLDoc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
         public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6763,7 +6763,7 @@ public FastTreeTweedieRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/XMLDoc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7190,7 +7190,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
         }
 
 
-        /// <include file='../Microsoft.ML.KMeansClustering/XMLDoc.xml' path='docs/members/member[@name="KMeans++"]/*' />
+        /// <include file='../Microsoft.ML.KMeansClustering/doc.xml' path='docs/members/member[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7306,7 +7306,7 @@ public enum LightGbmArgumentsEvalMetricType
         }
 
 
-        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7509,7 +7509,7 @@ public LightGbmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7712,7 +7712,7 @@ public LightGbmClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7915,7 +7915,7 @@ public LightGbmRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/XMLDoc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8253,8 +8253,8 @@ public LinearSvmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8402,8 +8402,8 @@ public LogisticRegressionBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/XMLDoc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8624,7 +8624,7 @@ public NaiveBayesClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/XMLDoc.xml' path='docs/members/member[@name="OGD"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name="OGD"]/*' />
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8777,7 +8777,7 @@ public OnlineGradientDescentRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8871,7 +8871,7 @@ public PcaAnomalyDetectorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/XMLDoc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9153,7 +9153,7 @@ public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9276,7 +9276,7 @@ public StochasticDualCoordinateAscentClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/XMLDoc.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9879,7 +9879,7 @@ public sealed partial class CategoricalHashTransformColumn : OneToOneColumn<Cate
 
         }
 
-        /// <include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
         public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10052,7 +10052,7 @@ public sealed partial class CategoricalTransformColumn : OneToOneColumn<Categori
 
         }
 
-        /// <include file='../Microsoft.ML.Transforms/XMLDoc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
         public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13983,7 +13983,7 @@ public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColu
 
         }
 
-        /// <include file='../Microsoft.ML.PCA/Pca.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name="PCA"]/*' />
         public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 

From c6ee265a0cad8ab8a5f1ffece28acb51a3ba12a2 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Wed, 11 Jul 2018 15:52:41 -0700
Subject: [PATCH 07/14] Adding documentation for the first group of transforms

---
 src/Microsoft.ML.Data/Transforms/NAFilter.cs  |   1 +
 src/Microsoft.ML.Data/Transforms/doc.xml      |  56 ++++++++
 .../TreeEnsembleFeaturizer.cs                 |   7 +-
 src/Microsoft.ML.FastTree/doc.xml             |  58 ++++++++
 .../Microsoft.ML.StandardLearners.csproj      |   1 +
 .../MultiClass/MultiClassNaiveBayesTrainer.cs |  11 +-
 .../Standard/MultiClass/Ova.cs                |   1 +
 .../Standard/MultiClass/Pkpd.cs               |  23 ++++
 .../Standard/MultiClass/doc.xml               |  59 ++++++++
 .../CountFeatureSelection.cs                  |   6 +-
 .../EntryPoints/SelectFeatures.cs             |  11 +-
 .../HashJoinTransform.cs                      |   7 +-
 .../MutualInformationFeatureSelection.cs      |   5 +-
 .../NADropTransform.cs                        |   4 +-
 .../NAHandleTransform.cs                      |  16 +--
 src/Microsoft.ML.Transforms/NAHandling.cs     |  24 +++-
 .../NAIndicatorTransform.cs                   |   5 +-
 .../OptionalColumnTransform.cs                |   8 +-
 src/Microsoft.ML.Transforms/doc.xml           | 130 ++++++++++++++++++
 .../Runtime/EntryPoints/OneVersusAllMacro.cs  |  18 +--
 20 files changed, 396 insertions(+), 55 deletions(-)
 create mode 100644 src/Microsoft.ML.Data/Transforms/doc.xml
 create mode 100644 src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml

diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs
index 7b94ff1e07..c8515291f3 100644
--- a/src/Microsoft.ML.Data/Transforms/NAFilter.cs
+++ b/src/Microsoft.ML.Data/Transforms/NAFilter.cs
@@ -26,6 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
+    /// <include file='doc.xml' path='doc/members/member[@name="NAFilter"]'/>
     public sealed class NAFilter : FilterBase
     {
         private static class Defaults
diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml
new file mode 100644
index 0000000000..a7239debe1
--- /dev/null
+++ b/src/Microsoft.ML.Data/Transforms/doc.xml
@@ -0,0 +1,56 @@
+﻿<?xml version="1.0" encoding="utf-8" ?>
+<doc>
+  <members>
+    <member name="NAFilter">
+      <summary>
+        Removes missing values from vector type columns.
+      </summary>
+      <remarks>
+        This transform removes the entire row if any of the input columns have a missing value in that row.
+        This preprocessing is required for many ML algorithms that cannot work with missing values.
+        Useful if any missing entry invalidates the entire row.
+        If the <see cref="Microsoft.ML.Runtime.Data.NAFilter.Defaults.Complement"/> is set to true, this transform does the exact opposite:
+        it keeps only the rows that have missing values.
+      </remarks>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new MissingValuesRowDropper(&quot;Column1&quot;));
+        </code>
+      </example>
+    </member>
+
+    <member name="NAHandle">
+      <summary>
+        Handles missing values by replacing them with either the default value or the indicated value.
+      </summary>
+      <remarks>
+        This transform handles missing values in the input columns. For each input column, it creates an output column
+         where the missing values are replaced by one of these specified values:
+         <list type="bullet">
+           <item><description>The default value of the appropriate type.</description></item>
+           <item><description>The mean value of the appropriate type.</description></item>
+           <item><description>The max value of the appropriate type.</description></item>
+           <item><description>The min value of the appropriate type.</description></item>
+         </list>
+         <para>The last three work only for numeric/TimeSpan/DateTime kind columns.</para>       
+         <para> The output column can also optionally include an indicator vector for which slots were missing in the input column.
+         This can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns.
+         </para>
+         <para>
+           When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot.
+           This option has a default value of true for variable length vectors, and false for known length vectors. 
+           It can be changed to true for known length vectors, but it results in an error if changed to false for variable length vectors.
+         </para>
+      </remarks>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"/>
+      <seealso cref="Microsoft.ML.Data.DataKind"/>
+      <example>
+        <code>
+          pipeline.Add(new MissingValueHandler(&quot;FeatureCol&quot;, &quot;CleanFeatureCol&quot;) { ReplaceWith  = NAHandleTransformReplacementKind.Mean });
+        </code>
+      </example>
+    </member>
+    
+  </members>
+</doc>
diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs
index a7044d2502..577b012815 100644
--- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs
+++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs
@@ -544,6 +544,7 @@ public ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema)
         }
     }
 
+    /// <include file='doc.xml' path='docs/members/member[@name="TreeEnsembleFeaturizerTransform"]/*'/>
     public static class TreeEnsembleFeaturizerTransform
     {
         public sealed class Arguments : TrainAndScoreTransform.ArgumentsBase<SignatureTreeEnsembleTrainer>
@@ -802,7 +803,11 @@ private static IDataView AppendLabelTransform(IHostEnvironment env, IChannel ch,
 
     public static partial class TreeFeaturize
     {
-        [TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer", Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, UserName = TreeEnsembleFeaturizerTransform.UserName, ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort)]
+        [TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer", 
+            Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, 
+            UserName = TreeEnsembleFeaturizerTransform.UserName, 
+            ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""TreeEnsembleFeaturizerTransform""]'/>" })]
         public static CommonOutputs.TransformOutput Featurizer(IHostEnvironment env, TreeEnsembleFeaturizerTransform.ArgumentsForEntryPoint input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml
index 36f0b41f24..2b87accd01 100644
--- a/src/Microsoft.ML.FastTree/doc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -73,6 +73,64 @@
         <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
       </remarks>
     </member>
+
+    <member name="TreeEnsembleFeaturizerTransform">
+      <summary>
+        Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector
+        to three outputs:
+        <list type="bullet">
+          <item>
+            <description>A vector containing the individual tree outputs of the tree ensemble.</description>
+          </item>
+          <item>
+            <description>A vector indicating the leaves that the feature vector falls on in the tree ensemble.</description>
+          </item>
+          <item>
+            <description>A vector indicating the paths that the feature vector falls on in the tree ensemble.</description>
+          </item>
+        </list>
+        If both a model file and a trainer are specified, the model file is used. If neither is specified,
+        a default FastTree model is trained.
+        This can handle key labels by training a regression model towards their optionally permuted indices.
+      </summary>
+      <remarks>
+        In machine learning, it is a common and powerful approach to use an already trained model in the process of defining features.
+        <para>The most obvious example is to use the model's scores as features for downstream models. For example, we might run clustering on the original features,
+        and use the cluster distances as the new feature set.
+        Instead of consuming the model's output, we could go deeper, and extract the 'intermediate outputs' that are used to produce the final score. </para>
+        There are a number of famous or popular examples of this technique:
+        <list type="bullet">
+          <item>
+            <description>A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'.
+            It is observed that the Euclidean distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together,
+            and far away from pictures of kittens. </description>
+          </item>
+          <item>
+            <description>A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items.</description>
+          </item>
+          <item>
+            <description>The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model,
+            and there's no reason to compute them. </description>
+          </item>
+        </list>
+        <para>Tree featurizer uses the decision tree ensembles for feature engineering in the same fashion as above.</para>
+        <para>Let's assume that we've built a tree ensemble of 100 trees with 100 leaves each (it doesn't matter whether boosting was used or not in training). 
+        If we associate each leaf of each tree with a sequential integer, we can, for every incoming example x, 
+        produce an indicator vector L(x), where Li(x) = 1 if the example x 'falls' into the leaf #i, and 0 otherwise.</para>
+        <para>Thus, for every example x, we produce a 10000-valued vector L, with exactly 100 1s and the rest zeroes. 
+        This 'leaf indicator' vector can be considered the ensemble-induced 'footprint' of the example.</para>
+        <para>The 'distance' between two examples in the L-space is actually a Hamming distance, and is equal to the number of trees that do not distinguish the two examples.</para>
+        <para>We could repeat the same thought process for the non-leaf, or internal, nodes of the trees (we know that each tree has exactly 99 of them in our 100-leaf example), 
+        and produce another indicator vector, N (size 9900), for each example, indicating the 'trajectory' of each example through each of the trees.</para>
+        <para>The distance in the combined 19900-dimensional LN-space will be equal to the number of 'decisions' in all trees that 'agree' on the given pair of examples.</para>
+        <para>The TreeLeafFeaturizer is also producing the third vector, T, which is defined as Ti(x) = output of tree #i on example x.</para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new TreeLeafFeaturizer());
+        </code>
+      </example>
+    </member>
         
   </members>
 </docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
index 6bada43299..9702f66080 100644
--- a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
+++ b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
@@ -10,6 +10,7 @@
     <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
     <ProjectReference Include="..\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" />
     <ProjectReference Include="..\Microsoft.ML.Data\Microsoft.ML.Data.csproj" />
+    <ProjectReference Include="..\Microsoft.ML.FastTree\Microsoft.ML.FastTree.csproj" />
     <ProjectReference Include="..\Microsoft.ML\Microsoft.ML.csproj" />
   </ItemGroup>
 
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
index 94efc8cf05..9fbb29b316 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
@@ -26,18 +26,13 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
+    /// <include file='doc.xml' path='docs/members/member[@name="MultiClassNaiveBayesTrainer"]/*' />
     public sealed class MultiClassNaiveBayesTrainer : TrainerBase<RoleMappedData, MultiClassNaiveBayesPredictor>
     {
         public const string LoadName = "MultiClassNaiveBayes";
         internal const string UserName = "Multiclass Naive Bayes";
         internal const string ShortName = "MNB";
         internal const string Summary = "Trains a multiclass Naive Bayes predictor that supports binary feature values.";
-        internal const string Remarks = @"<remarks>
-<a href ='https://en.wikipedia.org/wiki/Naive_Bayes_classifier'>Naive Bayes</a> is a probabilistic classifier that can be used for multiclass problems. 
-Using Bayes' theorem, the conditional probability for a sample belonging to a class can be calculated based on the sample count for each feature combination groups.
-However, Naive Bayes Classifier is feasible only if the number of features and the values each feature can take is relatively small.
-It also assumes that the features are strictly independent.
-</remarks>";
 
         public sealed class Arguments : LearnerInputBaseWithLabel
         {
@@ -132,7 +127,9 @@ public override MultiClassNaiveBayesPredictor CreatePredictor()
 
         [TlcModule.EntryPoint(Name = "Trainers.NaiveBayesClassifier",
             Desc = "Train a MultiClassNaiveBayesTrainer.",
-            UserName = UserName, ShortName = ShortName)]
+            UserName = UserName, 
+            ShortName = ShortName, 
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""MultiClassNaiveBayesTrainer""]'/>" } )]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClassNaiveBayesTrainer(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Ova.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Ova.cs
index 7b5fcc8a93..fd287c3d68 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Ova.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Ova.cs
@@ -36,6 +36,7 @@ namespace Microsoft.ML.Runtime.Learners
     using TScalarPredictor = IPredictorProducing<Float>;
     using TScalarTrainer = ITrainer<RoleMappedData, IPredictorProducing<Float>>;
 
+    /// <include file='doc.xml' path='docs/members/member[@name="OVA"]/*' />
     public sealed class Ova : MetaMulticlassTrainer<OvaPredictor, Ova.Arguments>
     {
         internal const string LoadNameValue = "OVA";
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
index cf1e7c062b..ffda5f9819 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
@@ -31,6 +31,29 @@ namespace Microsoft.ML.Runtime.Learners
     using TDistPredictor = IDistPredictorProducing<Float, Float>;
     using CR = RoleMappedSchema.ColumnRole;
 
+    /// <summary>
+    /// In this strategy, a binary classification algorithm is trained on each pair of classes. 
+    /// The pairs are unordered but created with replacement: so, if there were three classes, 0, 1,
+    /// 2, we would train classifiers for the pairs (0,0), (0,1), (0,2), (1,1), (1,2),
+    /// and (2,2). For each binary classifier, an input data point is considered a
+    /// positive example if it is in either of the two classes in the pair, and a
+    /// negative example otherwise. At prediction time, the probability for each
+    /// pair of classes is considered as the probability of being in either class of
+    /// the pair given the data, and the final predictive probabilities out of that
+    /// per class are calculated given the probability that an example is in any given
+    /// pair.
+    ///
+    /// These two can allow you to exploit trainers that do not naturally have a
+    /// multiclass option, e.g., using <see cref="Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer"/> 
+    /// to solve a multiclass problem.
+    /// Alternately, it can allow ML.NET to solve a "simpler" problem even in the cases
+    /// where the trainer has a multiclass option, but using it directly is not
+    /// practical due to, usually, memory constraints. For example, while a multiclass
+    /// logistic regression is a more principled way to solve a multiclass problem, it
+    /// requires that the learner store a lot more intermediate state in the form of
+    /// L-BFGS history for all classes *simultaneously*, rather than just one-by-one
+    /// as would be needed for OVA.
+    /// </summary>
     public sealed class Pkpd : MetaMulticlassTrainer<PkpdPredictor, Pkpd.Arguments>
     {
         internal const string LoadNameValue = "PKPD";
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
new file mode 100644
index 0000000000..aeb9e99093
--- /dev/null
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
@@ -0,0 +1,59 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<docs>
+  <members>
+
+    <member name="MultiClassNaiveBayesTrainer">
+      <summary>
+        Trains a multiclass Naive Bayes predictor that supports binary feature values.
+      </summary>
+      <remarks>
+        <a href ='https://en.wikipedia.org/wiki/Naive_Bayes_classifier'>Naive Bayes</a> is a probabilistic classifier that can be used for multiclass problems.
+        Using Bayes' theorem, the conditional probability for a sample belonging to a class can be calculated based on the sample count for each feature combination groups.
+        However, Naive Bayes Classifier is feasible only if the number of features and the values each feature can take is relatively small.
+        It assumes independence among the presence of features in a class even though they may be dependent on each other.
+        This multi-class trainer accepts binary feature values of type float, i.e., feature values are either true or false.
+        Specifically a feature value greater than zero is treated as true.
+        This learner will request normalization from the data pipeline if the
+        classifier indicates it would benefit from it. Note that even if the
+        classifier indicates that it does not need caching, OVA will always
+        request caching, as it will be performing multiple passes over the data set.
+      </remarks>
+      <seealso cref='LogisticRegressionClassifier'></seealso>
+      <seealso cref='LightGbmClassifier'></seealso>
+      <seealso cref='StochasticDualCoordinateAscentClassifier'></seealso>
+      <seealso cref='OneVersusAll'></seealso>
+      <example>
+        <code>
+          pipeline.Add(new NaiveBayesClassifier(){ NormalizeFeatures = NormalizeOption.Auto, Caching = CachingOptions.Memory });
+        </code>
+      </example>
+    </member>
+
+    <member name="OVA">
+      <summary>
+        In this strategy, a binary classification algorithm is used to train one classifier for each class, which distinguishes that class from all other classes.
+        Prediction is then performed by running these binary classifiers, and choosing the prediction with the highest confidence score.
+      </summary>
+      <remarks>
+        <para>This algorithm can be treated as a wrapper for all the binary classifiers in ML.NET. 
+        A few binary classifiers already have implementation for multi-class problems, 
+        thus users can choose either one depending on the context. 
+        </para>
+        <para>
+          The OVA version of a binary classifier, such as wrapping a LightGbmBinaryClassifier,
+          can be different from LightGbmClassifier, which develops a multi-class classifier directly. 
+        </para>
+      </remarks>
+      <seealso cref='LogisticRegressionClassifier'></seealso>
+      <seealso cref='LightGbmClassifier'></seealso>
+      <seealso cref='StochasticDualCoordinateAscentClassifier'></seealso>
+      <seealso cref='NaiveBayesClassifier'></seealso>
+      <example>
+        <code>
+          pipeline.Add(OneVersusAll.With(new StochasticDualCoordinateAscentBinaryClassifier()));
+        </code>
+      </example>
+    </member>
+   
+  </members>
+</docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs
index 79adda882e..c01508fc30 100644
--- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs
+++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs
@@ -18,11 +18,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// Selects the slots for which the count of non-default values is greater than a threshold.
-    /// Uses a set of aggregators to count the number of non-default values for each slot and
-    /// instantiates a DropSlots transform to actually drop the slots.
-    /// </summary>
+    /// <include file='doc.xml' path='docs/members/member[@name="CountFeatureSelection"]/*' />
     public static class CountFeatureSelectionTransform
     {
         public const string Summary = "Selects the slots for which the count of non-default values is greater than or equal to a threshold.";
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
index 583b7e00ad..d4791f59d5 100644
--- a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
+++ b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
@@ -11,7 +11,10 @@ namespace Microsoft.ML.Runtime.EntryPoints
 {
     public static class SelectFeatures
     {
-        [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByCount", Desc = CountFeatureSelectionTransform.Summary, UserName = CountFeatureSelectionTransform.UserName)]
+        [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByCount", 
+            Desc = CountFeatureSelectionTransform.Summary, 
+            UserName = CountFeatureSelectionTransform.UserName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CountFeatureSelection""]'/>" })]
         public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, CountFeatureSelectionTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -23,7 +26,11 @@ public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, Co
             return new CommonOutputs.TransformOutput { Model = new TransformModel(env, xf, input.Data), OutputData = xf };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByMutualInformation", Desc = MutualInformationFeatureSelectionTransform.Summary, UserName = MutualInformationFeatureSelectionTransform.UserName, ShortName = MutualInformationFeatureSelectionTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByMutualInformation",
+            Desc = MutualInformationFeatureSelectionTransform.Summary,
+            UserName = MutualInformationFeatureSelectionTransform.UserName,
+            ShortName = MutualInformationFeatureSelectionTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""MutualInformationFeatureSelection""]/*' />" })]
         public static CommonOutputs.TransformOutput MutualInformationSelect(IHostEnvironment env, MutualInformationFeatureSelectionTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/HashJoinTransform.cs b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
index 098564bef3..25adc7858f 100644
--- a/src/Microsoft.ML.Transforms/HashJoinTransform.cs
+++ b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
@@ -166,6 +166,7 @@ private static VersionInfo GetVersionInfo()
 
         private readonly ColumnInfoEx[] _exes;
 
+        /// <include file='doc.xml' path='docs/members/member[@name="HashJoin"]/*' />
         public HashJoinTransform(IHostEnvironment env, Arguments args, IDataView input)
             : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, input, TestColumnType)
         {
@@ -674,7 +675,11 @@ protected override ColumnType GetColumnTypeCore(int iinfo)
 
     public static class HashJoin
     {
-        [TlcModule.EntryPoint(Name = "Transforms.HashConverter", Desc = HashJoinTransform.Summary, UserName = HashJoinTransform.UserName, ShortName = HashJoinTransform.RegistrationName)]
+        [TlcModule.EntryPoint(Name = "Transforms.HashConverter", 
+            Desc = HashJoinTransform.Summary, 
+            UserName = HashJoinTransform.UserName, 
+            ShortName = HashJoinTransform.RegistrationName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""HashJoin""]/*' />" })]
         public static CommonOutputs.TransformOutput Apply(IHostEnvironment env, HashJoinTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs
index d8c20f03ca..cd7fb2d12f 100644
--- a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs
+++ b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs
@@ -21,10 +21,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// Selects the top k slots ordered by their mutual information with the label column.
-    /// Instantiates a DropSlots transform to actually drop the slots.
-    /// </summary>
+    /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
     public static class MutualInformationFeatureSelectionTransform
     {
         public const string Summary =
diff --git a/src/Microsoft.ML.Transforms/NADropTransform.cs b/src/Microsoft.ML.Transforms/NADropTransform.cs
index 347e889a5b..a87386e12a 100644
--- a/src/Microsoft.ML.Transforms/NADropTransform.cs
+++ b/src/Microsoft.ML.Transforms/NADropTransform.cs
@@ -21,9 +21,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// Transform to drop NAs from vector columns.
-    /// </summary>
+    /// <include file='doc.xml' path='docs/members/member[@name="NADrop"]'/>
     public sealed class NADropTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
diff --git a/src/Microsoft.ML.Transforms/NAHandleTransform.cs b/src/Microsoft.ML.Transforms/NAHandleTransform.cs
index 1b82fe3e1e..43666539be 100644
--- a/src/Microsoft.ML.Transforms/NAHandleTransform.cs
+++ b/src/Microsoft.ML.Transforms/NAHandleTransform.cs
@@ -17,21 +17,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// This transform handles missing values in the input columns. For each input column, it creates an output column
-    /// where the missing values are replaced by one of these specified values:
-    /// - The default value of the appropriate type.
-    /// - The mean value of the appropriate type.
-    /// - The max value of the appropriate type.
-    /// - The min value of the appropriate type.
-    /// (The last three work only for numeric/time span/ DateTime columns).
-    /// The output column can also optionally include an indicator vector for which slots were missing in the input column
-    /// (this can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns).
-    /// 
-    /// When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot. This option
-    /// has a default value of true for variable length vectors, and false for known length vectors. It can be changed to true for known
-    /// length vectors, but it results in an error if changed to false for variable length vectors.
-    /// </summary>
+    /// <include file='doc.xml' path='doc/members/member[@name="NAHandle"]/*' />
     public static class NAHandleTransform
     {
         public enum ReplacementKind
diff --git a/src/Microsoft.ML.Transforms/NAHandling.cs b/src/Microsoft.ML.Transforms/NAHandling.cs
index 2ed6830782..3ab3e7816d 100644
--- a/src/Microsoft.ML.Transforms/NAHandling.cs
+++ b/src/Microsoft.ML.Transforms/NAHandling.cs
@@ -11,7 +11,11 @@ namespace Microsoft.ML.Runtime.Data
 {
     public static class NAHandling
     {
-        [TlcModule.EntryPoint(Name = "Transforms.MissingValuesDropper", Desc = NADropTransform.Summary, UserName = NADropTransform.FriendlyName, ShortName = NADropTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.MissingValuesDropper", 
+            Desc = NADropTransform.Summary,
+            UserName = NADropTransform.FriendlyName, 
+            ShortName = NADropTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""NADrop""]/*' />" })]
         public static CommonOutputs.TransformOutput Drop(IHostEnvironment env, NADropTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, NADropTransform.ShortName, input);
@@ -23,7 +27,11 @@ public static CommonOutputs.TransformOutput Drop(IHostEnvironment env, NADropTra
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.MissingValuesRowDropper", Desc = NAFilter.Summary, UserName = NAFilter.FriendlyName, ShortName = NAFilter.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.MissingValuesRowDropper", 
+            Desc = NAFilter.Summary, 
+            UserName = NAFilter.FriendlyName, 
+            ShortName = NAFilter.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""NAFilter""]/*' />" })]
         public static CommonOutputs.TransformOutput Filter(IHostEnvironment env, NAFilter.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, NAFilter.ShortName, input);
@@ -35,7 +43,11 @@ public static CommonOutputs.TransformOutput Filter(IHostEnvironment env, NAFilte
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.MissingValueHandler", Desc = NAHandleTransform.Summary, UserName = NAHandleTransform.FriendlyName, ShortName = NAHandleTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.MissingValueHandler",
+            Desc = NAHandleTransform.Summary,
+            UserName = NAHandleTransform.FriendlyName,
+            ShortName = NAHandleTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAHandle""]/*' />" })]
         public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandleTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAHandle", input);
@@ -47,7 +59,11 @@ public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandl
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.MissingValueIndicator", Desc = NAIndicatorTransform.Summary, UserName = NAIndicatorTransform.FriendlyName, ShortName = NAIndicatorTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.MissingValueIndicator", 
+            Desc = NAIndicatorTransform.Summary, 
+            UserName = NAIndicatorTransform.FriendlyName, 
+            ShortName = NAIndicatorTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""NAIndicator""]/*' />" })]
         public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIndicatorTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAIndicator", input);
diff --git a/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs b/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
index 38ecc2c817..82d79203dd 100644
--- a/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
+++ b/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
@@ -21,10 +21,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// This transform can transform either scalars or vectors (both fixed and variable size),
-    /// creating output columns that indicate corresponding NA values.
-    /// </summary>
+    /// <include file='doc.xml' path='docs/members/member[@name="NAIndicator"]'/>
     public sealed class NAIndicatorTransform : OneToOneTransformBase
     {
         public sealed class Column : OneToOneColumn
diff --git a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
index 5d3ab591b2..dc1cda3ace 100644
--- a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
+++ b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
@@ -26,6 +26,7 @@
 
 namespace Microsoft.ML.Runtime.DataPipe
 {
+    /// <include file='./doc.xml' path='docs/members/member[@name="OptionalColumnTransform"]/*' />
     public class OptionalColumnTransform : RowToRowMapperTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -459,7 +460,12 @@ private Delegate MakeGetterVec<T>(int length)
             }
         }
 
-        [TlcModule.EntryPoint(Desc = Summary, Name = "Transforms.OptionalColumnCreator", UserName = UserName, ShortName = ShortName)]
+        [TlcModule.EntryPoint(Desc = Summary, 
+            Name = "Transforms.OptionalColumnCreator", 
+            UserName = UserName, 
+            ShortName = ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""OptionalColumnTransform""]/*' />" })]
+
         public static CommonOutputs.TransformOutput MakeOptional(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "OptionalColumn", input);
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index 7482c3a272..9fd37cd278 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -47,6 +47,111 @@
         </code>
       </example>
     </member>
+
+    <member name="CountFeatureSelection">
+      <summary>
+        Selects the slots for which the count of non-default values is greater than or equal to a threshold.
+      </summary>
+      <remarks>
+        <para>
+          This transform uses a set of aggregators to count the number of non-default values for each slot and
+          instantiates a <see cref="Microsoft.ML.Runtime.Data.DropSlotsTransform"/> to actually drop the slots.
+          This transform is useful when applied together with a <see cref="Microsoft.ML.Transforms.CategoricalHashOneHotVectorizer"/>.
+          The count feature selection can remove those features generated by the hash transform that have no data in the examples.
+        </para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new FeatureSelectorByCount() { Column = new[]{ &quot;Feature1&quot; }, Count = 2 });
+        </code>
+      </example>
+    </member>
+
+    <member name="MutualInformationFeatureSelection">
+      <summary>
+        Selects the top k slots across all specified columns ordered by their mutual information with the label column.
+      </summary>
+      <remarks>
+        <para>
+          The mutual information of two random variables X and Y is a measure of the mutual dependence between the variables.
+          Formally, the mutual information can be written as:
+        </para>
+        <para>I(X;Y) = E[log(p(x,y)) - log(p(x)) - log(p(y))]</para>
+        <para>
+          where the expectation is taken over the joint distribution of X and Y.
+          Here p(x,y) is the joint probability density function of X and Y, p(x) and p(y) are the marginal probability density functions of X and Y respectively.
+          In general, a higher mutual information between the dependent variable (or label) and an independent variable (or feature) means
+          that the label has higher mutual dependence over that feature.
+          The mutual information feature selection mode selects the features based on the mutual information.
+          It keeps the top SlotsInOutput features with the largest mutual information with the label.
+        </para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new FeatureSelectorByMutualInformation() { Column = new[]{ &quot;Feature1&quot; }, SlotsInOutput = 6 });
+        </code>
+      </example>
+    </member>
+
+    <member name="OptionalColumnTransform">
+      <summary>
+        Creates additional columns with a particular type and default values,
+        or replicates the values from one column to another while changing their type.
+      </summary>
+      <remarks>
+        This transform can be used as a workaround to create a Label column after deserializing a model, for prediction.
+        Some transforms in the serialized model operate on the Label column, and would throw errors during prediction if such a column is not found.
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new OptionalColumnCreator() { Column = new[]{ &quot;OptColumn&quot; } });
+        </code>
+      </example>
+    </member>
+
+    <member name="HashJoin">
+      <summary>
+        Converts multiple column values into hashes. 
+        This transform accepts both numeric and text inputs, both single and vector-valued columns. 
+      </summary>
+      <remarks>
+        This transform can be helpful for ranking and cross-validation. In the case of ranking, where the GroupIdColumn column is required
+        and needs to be of a key type, you can use the CategoricalHashOneHotVectorizer to hash the text value of a single GroupID column into a key value.
+        If the GroupID is the combination of the values from multiple columns, you can use the HashConverter to hash multiple text columns into one key column.
+        The same approach applies to the StratificationColumn of the CrossValidator.
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new HashConverter(&quot;Column1&quot;, &quot;Column2&quot;));
+        </code>
+      </example>
+    </member>
+
+    <member name="NADrop">
+      <summary>
+        Removes missing values from vector type columns.
+      </summary>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new MissingValuesDropper(&quot;Column1&quot;));
+        </code>
+      </example>
+    </member>
+
+
+    <member name="NAIndicator">
+      <summary>
+        This transform can transform either scalars or vectors (both fixed and variable size),
+        creating output columns that indicate, through true/false boolean values, whether the corresponding input value is missing.
+      </summary>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new MissingValueIndicator(&quot;Column1&quot;));
+        </code>
+      </example>
+    </member>
   
   </members>
 </docs>
\ No newline at end of file
diff --git a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
index e4a54de040..14a56da596 100644
--- a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
+++ b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
@@ -55,13 +55,13 @@ private static Tuple<List<EntryPointNode>, Var<IPredictorModel>> ProcessClass(IH
                 ClassIndex = k,
                 Column = new[]
                 {
-                            new ML.Transforms.LabelIndicatorTransformColumn
-                            {
-                                ClassIndex = k,
-                                Name = label,
-                                Source = label
-                            }
-                        },
+                    new ML.Transforms.LabelIndicatorTransformColumn
+                    {
+                        ClassIndex = k,
+                        Name = label,
+                        Source = label
+                    }
+                },
                 Data = { VarName = node.GetInputVariable(nameof(input.TrainingData)).ToJson() }
             };
             var exp = new Experiment(env);
@@ -134,7 +134,9 @@ private static int GetNumberOfClasses(IHostEnvironment env, Arguments input, out
             }
         }
 
-        [TlcModule.EntryPoint(Desc = "One-vs-All macro (OVA)", Name = "Models.OneVersusAll")]
+        [TlcModule.EntryPoint(Desc = "One-vs-All macro (OVA)",
+            Name = "Models.OneVersusAll",
+            XmlInclude = new[] { @"<include file='../../../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""OVA""]'/>" })]
         public static CommonOutputs.MacroOutput<Output> OVA(
             IHostEnvironment env,
             Arguments input,

From 781293d653744956d19fc55b988c5380590599ac Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Fri, 13 Jul 2018 09:35:42 -0700
Subject: [PATCH 08/14] adding more documentation. changing the root of the XML
 documents from docs -> doc, since its only one. Switching all <see href /> to
 the valid <see cref />

---
 .../Utilities/ReservoirSampler.cs             |   4 +-
 .../Evaluators/AucAggregator.cs               |   4 +-
 .../Transforms/TermTransform.cs               |  16 +-
 .../FastTreeArguments.cs                      |   2 +-
 .../FastTreeClassification.cs                 |   4 +-
 src/Microsoft.ML.FastTree/FastTreeRanking.cs  |   4 +-
 .../FastTreeRegression.cs                     |   4 +-
 src/Microsoft.ML.FastTree/FastTreeTweedie.cs  |   4 +-
 .../RandomForestClassification.cs             |   4 +-
 .../RandomForestRegression.cs                 |   4 +-
 .../Training/Parallel/IParallelTraining.cs    |  18 +-
 src/Microsoft.ML.FastTree/doc.xml             |   4 +-
 .../InternalStreams.cs                        |  26 +--
 .../KMeansPlusPlusTrainer.cs                  |   4 +-
 src/Microsoft.ML.KMeansClustering/doc.xml     |   4 +-
 .../LightGbmBinaryTrainer.cs                  |   4 +-
 .../LightGbmMulticlassTrainer.cs              |   4 +-
 .../LightGbmRankingTrainer.cs                 |   4 +-
 .../LightGbmRegressionTrainer.cs              |   4 +-
 src/Microsoft.ML.LightGBM/doc.xml             |   4 +-
 src/Microsoft.ML.PCA/PcaTrainer.cs            |   4 +-
 src/Microsoft.ML.PCA/PcaTransform.cs          |   4 +-
 src/Microsoft.ML.PCA/doc.xml                  |   6 +-
 .../FactorizationMachineTrainer.cs            |   4 +-
 .../FactorizationMachine/doc.xml              |   4 +-
 .../Standard/LinearClassificationTrainer.cs   |   2 +-
 .../LogisticRegression/LogisticRegression.cs  |   6 +-
 .../MulticlassLogisticRegression.cs           |   4 +-
 .../Standard/LogisticRegression/doc.xml       |   4 +-
 .../Standard/MultiClass/doc.xml               |  20 +-
 .../Standard/Online/AveragedPerceptron.cs     |   4 +-
 .../Standard/Online/OnlineGradientDescent.cs  |   4 +-
 .../Standard/Online/doc.xml                   |   4 +-
 .../PoissonRegression/PoissonRegression.cs    |   4 +-
 .../Standard/PoissonRegression/doc.xml        |   4 +-
 .../Standard/SdcaMultiClass.cs                |   4 +-
 .../Standard/SdcaRegression.cs                |   4 +-
 .../Standard/doc.xml                          |   4 +-
 .../CategoricalHashTransform.cs               |   2 +-
 .../CategoricalTransform.cs                   |  16 +-
 .../EntryPoints/TextAnalytics.cs              |  44 ++++-
 src/Microsoft.ML.Transforms/GcnTransform.cs   |  14 +-
 src/Microsoft.ML.Transforms/GroupTransform.cs |   6 +-
 .../HashJoinTransform.cs                      |   4 +-
 .../NADropTransform.cs                        |   2 +-
 src/Microsoft.ML.Transforms/NAHandling.cs     |  10 +-
 .../NAIndicatorTransform.cs                   |   2 +-
 .../NAReplaceTransform.cs                     |  15 +-
 .../OptionalColumnTransform.cs                |   4 +-
 .../Text/LdaTransform.cs                      |  33 ++--
 .../Text/SentimentAnalyzerTransform.cs        |   1 +
 .../Text/TextTransform.cs                     |   9 +-
 .../Text/WordTokenizeTransform.cs             |   9 +-
 src/Microsoft.ML.Transforms/Text/doc.xml      | 185 ++++++++++++++++++
 .../UngroupTransform.cs                       |  50 ++---
 .../WhiteningTransform.cs                     |   2 +-
 src/Microsoft.ML.Transforms/doc.xml           | 179 ++++++++++++++++-
 src/Microsoft.ML/CSharpApi.cs                 | 144 +++++---------
 src/Microsoft.ML/Models/OnnxConverter.cs      |   2 +-
 .../Runtime/EntryPoints/OneVersusAllMacro.cs  |   2 +-
 60 files changed, 653 insertions(+), 298 deletions(-)
 create mode 100644 src/Microsoft.ML.Transforms/Text/doc.xml

diff --git a/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs b/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs
index a755788fb4..b8006bd943 100644
--- a/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs
+++ b/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs
@@ -47,7 +47,7 @@ public interface IReservoirSampler<T>
     /// This class produces a sample without replacement from a stream of data of type <typeparamref name="T"/>. 
     /// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/> 
     /// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
-    /// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
+    /// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
     /// </summary>
     public sealed class ReservoirSamplerWithoutReplacement<T> : IReservoirSampler<T>
     {
@@ -120,7 +120,7 @@ public IEnumerable<T> GetSample()
     /// This class produces a sample with replacement from a stream of data of type <typeparamref name="T"/>. 
     /// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/> 
     /// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
-    /// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
+    /// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
     /// </summary>
     public sealed class ReservoirSamplerWithReplacement<T> : IReservoirSampler<T>
     {
diff --git a/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs b/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs
index f45aacd58e..342e1d3529 100644
--- a/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs
+++ b/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs
@@ -408,7 +408,7 @@ public UnweightedAuPrcAggregator(IRandom rand, int reservoirSize)
 
             /// <summary>
             /// Compute the AUPRC using the "lower trapesoid" estimator, as described in the paper
-            /// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
+            /// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
             /// </summary>
             protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
             {
@@ -482,7 +482,7 @@ public WeightedAuPrcAggregator(IRandom rand, int reservoirSize)
 
             /// <summary>
             /// Compute the AUPRC using the "lower trapesoid" estimator, as described in the paper
-            /// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
+            /// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
             /// </summary>
             protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
             {
diff --git a/src/Microsoft.ML.Data/Transforms/TermTransform.cs b/src/Microsoft.ML.Data/Transforms/TermTransform.cs
index 7591179588..ea2248716e 100644
--- a/src/Microsoft.ML.Data/Transforms/TermTransform.cs
+++ b/src/Microsoft.ML.Data/Transforms/TermTransform.cs
@@ -29,14 +29,14 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// TermTransform builds up term vocabularies (dictionaries).
-    /// Notes:
-    /// * Each column builds/uses exactly one "vocabulary" (dictionary).
-    /// * Output columns are KeyType-valued.
-    /// * The Key value is the one-based index of the item in the dictionary.
-    /// * Not found is assigned the value zero.
-    /// </summary>
+
+    // TermTransform builds up term vocabularies (dictionaries).
+    // Notes:
+    // * Each column builds/uses exactly one "vocabulary" (dictionary).
+    // * Output columns are KeyType-valued.
+    // * The Key value is the one-based index of the item in the dictionary.
+    // * Not found is assigned the value zero.
+    /// <include file='doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
     public sealed partial class TermTransform : OneToOneTransformBase, ITransformTemplate
     {
         public abstract class ColumnBase : OneToOneColumn
diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
index e6274e3155..2c6fc02745 100644
--- a/src/Microsoft.ML.FastTree/FastTreeArguments.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -20,7 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
     {
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer
     {
         [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
index 18f61e6dbe..1eb2a7b6f9 100644
--- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -100,7 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file = './doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer :
         BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -346,7 +346,7 @@ public static partial class FastTree
             Desc = FastTreeBinaryClassificationTrainer.Summary,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
             ShortName = FastTreeBinaryClassificationTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
index 6eabca8c78..6a17dc2d13 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -38,7 +38,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
         IHasLabelGains
     {
@@ -1101,7 +1101,7 @@ public static partial class FastTree
             Desc = FastTreeRankingTrainer.Summary,
             UserName = FastTreeRankingTrainer.UserNameValue,
             ShortName = FastTreeRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
index 308437440a..c625594946 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -31,7 +31,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
     {
         public const string LoadNameValue = "FastTreeRegression";
@@ -453,7 +453,7 @@ public static partial class FastTree
             Desc = FastTreeRegressionTrainer.Summary,
             UserName = FastTreeRegressionTrainer.UserNameValue,
             ShortName = FastTreeRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
index b43c499a44..c77f3532bd 100644
--- a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
@@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.FastTree
     // The Tweedie boosting model follows the mathematics established in:
     // Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
     // https://arxiv.org/pdf/1508.06378.pdf
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTreeTweedieRegression"]/*' />
     public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
     {
         public const string LoadNameValue = "FastTreeTweedieRegression";
@@ -461,7 +461,7 @@ public static partial class FastTree
             Desc = FastTreeTweedieTrainer.Summary,
             UserName = FastTreeTweedieTrainer.UserNameValue,
             ShortName = FastTreeTweedieTrainer.ShortName,
-            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
+            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
index 8cd62ceb77..86daffd712 100644
--- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -106,7 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestClassification :
         RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -213,7 +213,7 @@ public static partial class FastForest
             Desc = FastForestClassification.Summary,
             UserName = FastForestClassification.UserNameValue,
             ShortName = FastForestClassification.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
index f501037df3..68817b420d 100644
--- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -137,7 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
         }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
     {
         public sealed class Arguments : FastForestArgumentsBase
@@ -285,7 +285,7 @@ public static partial class FastForest
             Desc = FastForestRegression.Summary,
             UserName = FastForestRegression.LoadNameValue,
             ShortName = FastForestRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs b/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs
index 525e60947b..08ae6fb16f 100644
--- a/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs
+++ b/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs
@@ -33,20 +33,20 @@ public delegate void FindBestThresholdFromRawArrayFun(LeafSplitCandidates leafSp
     /// <summary>
     /// Interface used for parallel training.
     /// Mainly contains three parts:
-    /// 1. interactive with IO: <see href="GetLocalBinConstructionFeatures" />, <see href="SyncGlobalBoundary" />.
+    /// 1. Interaction with IO: <see cref="GetLocalBinConstructionFeatures" />, <see cref="SyncGlobalBoundary" />.
     ///    Data will be partitioned by rows in Data parallel and Voting Parallel.
     ///    To speed up the find bin process, it let different workers to find bins for different features.
     ///    Then perform global sync up.
     ///    In Feature parallel, every machines holds all data, so this is unneeded.
-    /// 2. interactive with TreeLearner: <see href="InitIteration" />, <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" />, 
-    ///        <see href="IsSkipNonSplittableHistogram" />, <see href="FindGlobalBestSplit" />, <see href="GetGlobalDataCountInLeaf" />, <see href="PerformGlobalSplit" />.
+    /// 2. Interaction with TreeLearner: <see cref="InitIteration" />, <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" />, 
+    ///        <see cref="IsSkipNonSplittableHistogram" />, <see cref="FindGlobalBestSplit" />, <see cref="GetGlobalDataCountInLeaf" />, <see cref="PerformGlobalSplit" />.
     ///    A full process is:
-    ///        Use <see href="InitIteration" /> to alter local active features.
-    ///        Use <see href="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
-    ///        Use <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" /> and <see href="IsSkipNonSplittableHistogram" /> to interactive with Feature histograms.
-    ///        Use <see href="FindGlobalBestSplit" /> to sync up global best split
-    ///        Use <see href="PerformGlobalSplit" /> to record global num_data in leaves.
-    /// 3. interactive with Application : <see href="GlobalMean" />.
+    ///        Use <see cref="InitIteration" /> to alter local active features.
+    ///        Use <see cref="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
+    ///        Use <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" /> and <see cref="IsSkipNonSplittableHistogram" /> to interact with feature histograms.
+    ///        Use <see cref="FindGlobalBestSplit" /> to sync up global best split
+    ///        Use <see cref="PerformGlobalSplit" /> to record global num_data in leaves.
+    /// 3. Interaction with Application: <see cref="GlobalMean" />.
     ///    Output of leaves is calculated by newton step ( - sum(first_order_gradients) / sum(second_order_gradients)).
     ///    If data is partitioned by row, it needs to a sync up for these sum result.
     ///    So It needs to call this to get the real output of leaves.
diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml
index 2b87accd01..0990c8c3dc 100644
--- a/src/Microsoft.ML.FastTree/doc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
 
     <member name="FastTree">
@@ -133,4 +133,4 @@
     </member>
         
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.InternalStreams/InternalStreams.cs b/src/Microsoft.ML.InternalStreams/InternalStreams.cs
index 76cd8413f6..2963bf4603 100644
--- a/src/Microsoft.ML.InternalStreams/InternalStreams.cs
+++ b/src/Microsoft.ML.InternalStreams/InternalStreams.cs
@@ -6086,9 +6086,9 @@ public long Length
     /// </list>
     /// <para>
     /// Compression support relies on executable utilities to be in the path.
-    /// See <see href="http://7-zip.org"/> for 7z.exe and 7za.exe (for many formats -
-    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <see href="http://gnuwin32.sourceforge.net/packages/gzip.htm"/> for gzip.exe
-    /// (for .gz), or <see href="http://rarsoft.com"/> for unrar.exe (for .rar).
+    /// See <a href="http://7-zip.org">http://7-zip.org</a> for 7z.exe and 7za.exe (for many formats -
+    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <a href="http://gnuwin32.sourceforge.net/packages/gzip.htm">http://gnuwin32.sourceforge.net/packages/gzip.htm</a> for gzip.exe
+    /// (for .gz), or <a href="http://rarsoft.com">http://rarsoft.com</a> for unrar.exe (for .rar).
     /// </para>
     /// </remarks>
 #else
@@ -6771,8 +6771,8 @@ protected override void Dispose(bool disposing)
     /// </list>
     /// <para>
     /// Compression support relies on executable utilities to be in the path.
-    /// See <see href="http://7-zip.org"/> for 7z.exe and 7za.exe (for
-    /// .7z, .gz), <see href="http://gnuwin32.sourceforge.net/packages/gzip.htm"/> for gzip.exe
+    /// See <a href="http://7-zip.org">http://7-zip.org</a> for 7z.exe and 7za.exe (for
+    /// .7z, .gz), <a href="http://gnuwin32.sourceforge.net/packages/gzip.htm">http://gnuwin32.sourceforge.net/packages/gzip.htm</a> for gzip.exe
     /// (for .gz).
     /// </para>
     /// </remarks>
@@ -7322,9 +7322,9 @@ public static StreamWriter OpenUnbuffered(string outFileName, bool append, bool
     /// </list>
     /// <para>
     /// Compression support relies on executable utilities to be in the path.
-    /// See <see href="http://7-zip.org"/> for 7z.exe and 7za.exe (for many formats -
-    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <see href="http://gnuwin32.sourceforge.net/packages/gzip.htm"/> for gzip.exe
-    /// (for .gz), or <see href="http://rarsoft.com"/> for unrar.exe (for .rar).
+    /// See <a href="http://7-zip.org">http://7-zip.org</a> for 7z.exe and 7za.exe (for many formats -
+    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <a href="http://gnuwin32.sourceforge.net/packages/gzip.htm">http://gnuwin32.sourceforge.net/packages/gzip.htm</a> for gzip.exe
+    /// (for .gz), or <a href="http://rarsoft.com">http://rarsoft.com</a> for unrar.exe (for .rar).
     /// </para>
     /// </remarks>
 #else
@@ -8383,8 +8383,8 @@ public static Stream OpenUnbuffered(string fileName, bool async)
     /// </list>
     /// <para>
     /// Compression support relies on executable utilities to be in the path.
-    /// See <see href="http://7-zip.org"/> for 7z.exe and 7za.exe (for
-    /// .7z, .gz), <see href="http://gnuwin32.sourceforge.net/packages/gzip.htm"/> for gzip.exe
+    /// See <a href="http://7-zip.org">http://7-zip.org</a> for 7z.exe and 7za.exe (for
+    /// .7z, .gz), <a href="http://gnuwin32.sourceforge.net/packages/gzip.htm">http://gnuwin32.sourceforge.net/packages/gzip.htm</a> for gzip.exe
     /// (for .gz).
     /// </para>
     /// </remarks>
@@ -13354,9 +13354,9 @@ IEnumerator IEnumerable.GetEnumerator()
     /// </para>
     /// <para>
     /// Compression support relies on executable utilities to be in the path.
-    /// See <see href="http://7-zip.org"/> for 7z.exe and 7za.exe (for many formats -
-    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <see href="http://gnuwin32.sourceforge.net/packages/gzip.htm"/> for gzip.exe
-    /// (for .gz), or <see href="http://rarsoft.com"/> for unrar.exe (for .rar). Gzip support built-in to .NET
+    /// See <a href="http://7-zip.org">http://7-zip.org</a> for 7z.exe and 7za.exe (for many formats -
+    /// .7z, .gz, .zip, .rar, .bz2, .cab, .arj), <a href="http://gnuwin32.sourceforge.net/packages/gzip.htm">http://gnuwin32.sourceforge.net/packages/gzip.htm</a> for gzip.exe
+    /// (for .gz), or <a href="http://rarsoft.com">http://rarsoft.com</a> for unrar.exe (for .rar). Gzip support built-in to .NET
     /// 2.0 can be used, but it has extreme deficiencies in terms of speed, size, and flexibility.
     /// </para>
     /// <para>
diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
index dce7be48d2..66dcfaaeca 100644
--- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
+++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
@@ -28,7 +28,7 @@
 
 namespace Microsoft.ML.Runtime.KMeans
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="KMeans++"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="KMeans++"]/*' />
     public class KMeansPlusPlusTrainer : TrainerBase<RoleMappedData, KMeansPredictor>
     {
         public const string LoadNameValue = "KMeansPlusPlus";
@@ -213,7 +213,7 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='docs/members/member[@name=""KMeans++""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/member[@name=""KMeans++""]/*' />" })]
         public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.KMeansClustering/doc.xml b/src/Microsoft.ML.KMeansClustering/doc.xml
index affaeabf98..9c26056811 100644
--- a/src/Microsoft.ML.KMeansClustering/doc.xml
+++ b/src/Microsoft.ML.KMeansClustering/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
 
     <member name="KMeans++">
@@ -19,4 +19,4 @@
     </member>
    
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 54cd523e72..6806b00110 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -81,7 +81,7 @@ public static IPredictorProducing<float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmBinaryTrainer : LightGbmTrainerBase<float, IPredictorWithFeatureWeights<float>>
     {
         internal const string UserName = "LightGBM Binary Classifier";
@@ -134,7 +134,7 @@ public static partial class LightGbm
             Desc = LightGbmBinaryTrainer.Summary,
             UserName = LightGbmBinaryTrainer.UserName, 
             ShortName = LightGbmBinaryTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 2a84bad0e8..858013388c 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -18,7 +18,7 @@
 namespace Microsoft.ML.Runtime.LightGBM
 {
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmMulticlassTrainer : LightGbmTrainerBase<VBuffer<float>, OvaPredictor>
     {
         public const string Summary = "LightGBM Multi Class Classifier";
@@ -185,7 +185,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM multi class model.", 
             UserName = LightGbmMulticlassTrainer.Summary, 
             ShortName = LightGbmMulticlassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index 4a1d1634a8..659e4239d7 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -71,7 +71,7 @@ public static LightGbmRankingPredictor Create(IHostEnvironment env, ModelLoadCon
         public override PredictionKind PredictionKind { get { return PredictionKind.Ranking; } }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, LightGbmRankingPredictor>
     {
         public const string UserName = "LightGBM Ranking";
@@ -132,7 +132,7 @@ public static partial class LightGbm
             Desc = "Train a LightGBM ranking model.", 
             UserName = LightGbmRankingTrainer.UserName, 
             ShortName = LightGbmRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 6ae3da792a..13c254a509 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -20,7 +20,7 @@
 
 namespace Microsoft.ML.Runtime.LightGBM
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
     public sealed class LightGbmRegressionPredictor : FastTreePredictionWrapper
     {
         public const string LoaderSignature = "LightGBMRegressionExec";
@@ -124,7 +124,7 @@ public static partial class LightGbm
             Desc = LightGbmRegressorTrainer.Summary, 
             UserName = LightGbmRegressorTrainer.UserNameValue, 
             ShortName = LightGbmRegressorTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/doc.xml b/src/Microsoft.ML.LightGBM/doc.xml
index 4d53265ae3..b32473485a 100644
--- a/src/Microsoft.ML.LightGBM/doc.xml
+++ b/src/Microsoft.ML.LightGBM/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
     
     <member name="LightGBM">
@@ -13,4 +13,4 @@
     </member>
 
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 23e7351a86..840744f6f3 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -288,7 +288,7 @@ private static void PostProcess(VBuffer<Float>[] y, Float[] sigma, Float[] z, in
             Desc = "Train an PCA Anomaly model.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -308,7 +308,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm
     // - - If the error is close to 0, the instance is considered normal (non-anomaly).
     // REVIEW: move the predictor to a different file and fold EigenUtils.cs to this file.
     // REVIEW: Include the above detail in the XML documentation file. 
-    /// <include file='./doc.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="PCA"]/*' />
     public sealed class PcaPredictor : PredictorBase<Float>,
         IValueMapper,
         ICanGetSummaryAsIDataView,
diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index 0807abc5ed..3973d443b8 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="PCA"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="PCA"]/*' />
     public sealed class PcaTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -541,7 +541,7 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
             Desc = Summary,
             UserName = UserName, 
             ShortName = ShortName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />" })]
         public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input);
diff --git a/src/Microsoft.ML.PCA/doc.xml b/src/Microsoft.ML.PCA/doc.xml
index 98d423b754..42d3218c0f 100644
--- a/src/Microsoft.ML.PCA/doc.xml
+++ b/src/Microsoft.ML.PCA/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
     
     <member name="PCA">
@@ -9,7 +9,7 @@
       <remarks>
       <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principle Component Analysis (PCA)</a> is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace.
       Its training is done using the technique described in the paper: <a href='https://arxiv.org/pdf/1310.6304v2.pdf'>Combining Structured and Unstructured Randomness in Large Scale PCA</a>,
-      and the paper <see href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</see>
+      and the paper <a href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
       <a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
       <a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
       <a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
@@ -24,4 +24,4 @@
     </member>
     
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
index b967bd9f95..f14138d881 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.FactorizationMachine
      [2] http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
      [3] https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
     */
-    /// <include file='./doc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
     public sealed class FieldAwareFactorizationMachineTrainer : TrainerBase<RoleMappedData, FieldAwareFactorizationMachinePredictor>,
         IIncrementalTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>, IValidatingTrainer<RoleMappedData>,
         IIncrementalValidatingTrainer<RoleMappedData, FieldAwareFactorizationMachinePredictor>
@@ -407,7 +407,7 @@ public override FieldAwareFactorizationMachinePredictor CreatePredictor()
             Desc = Summary,
             UserName = UserName,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='docs/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
index 2e72b2ea9f..5d90ea802c 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
     
     <member name="FieldAwareFactorizationMachineBinaryClassifier">
@@ -39,4 +39,4 @@
     </member>
         
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
index 554babf1ce..90e9f60a20 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
@@ -1779,7 +1779,7 @@ public static partial class Sdca
             Desc = "Train an SDCA binary model.",
             UserName = LinearClassificationTrainer.UserNameValue,
             ShortName = LinearClassificationTrainer.LoadNameValue,
-            XmlInclude = new[] { @"<include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name=""StochasticDualCoordinateAscentBinaryClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LinearClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index 3cf97ea801..1d63d66d2d 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -30,8 +30,8 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using Mkl = Microsoft.ML.Runtime.Learners.OlsLinearRegressionTrainer.Mkl;
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file='./doc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
+    /// <include file='doc.xml' path='doc/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
     public sealed partial class LogisticRegression : LbfgsTrainerBase<Float, ParameterMixingCalibratedPredictor>
     {
         public const string LoadNameValue = "LogisticRegression";
@@ -392,7 +392,7 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
                             
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index 66c1d41084..4ddb873f6e 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -36,7 +36,7 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file = './doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
-    /// <include file = './doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+    /// <include file = './doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
+    /// <include file = './doc.xml' path='doc/members/example[@name="LogisticRegressionClassifier"]/*' />
     public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<VBuffer<Float>, MulticlassLogisticRegressionPredictor>
     {
@@ -966,7 +966,7 @@ public partial class LogisticRegression
             Desc = Summary,
             UserName = MulticlassLogisticRegression.UserNameValue,
             ShortName = MulticlassLogisticRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name=""LBFGS""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
index 5ac68e2fc0..9d1e38f237 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
 
     <member name="LBFGS">
@@ -64,4 +64,4 @@
     </example>
    
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
index aeb9e99093..68faa53632 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
 
     <member name="MultiClassNaiveBayesTrainer">
@@ -18,10 +18,10 @@
         classifier indicates that it does not need caching, OVA will always
         request caching, as it will be performing multiple passes over the data set.
       </remarks>
-      <seealso cref='LogisticRegressionClassifier'></seealso>
-      <seealso cref='LightGbmClassifier'></seealso>
-      <seealso cref='StochasticDualCoordinateAscentClassifier'></seealso>
-      <seealso cref='OneVersusAll'></seealso>
+      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'>LogisticRegressionClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'>LightGbmClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'>StochasticDualCoordinateAscentClassifier</seealso>
+      <seealso cref='Microsoft.ML.Models.OneVersusAll'>OneVersusAll</seealso>
       <example>
         <code>
           pipeline.Add(new NaiveBayesClassifier(){ NormalizeFeatures = NormalizeOption.Auto, Caching = CachingOptions.Memory });
@@ -44,10 +44,10 @@
           can be different from LightGbmClassifier, which develops a multi-class classifier directly. 
         </para>
       </remarks>
-      <seealso cref='LogisticRegressionClassifier'></seealso>
-      <seealso cref='LightGbmClassifier'></seealso>
-      <seealso cref='StochasticDualCoordinateAscentClassifier'></seealso>
-      <seealso cref='NaiveBayesClassifier'></seealso>
+      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'>LogisticRegressionClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'>LightGbmClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'>StochasticDualCoordinateAscentClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.NaiveBayesClassifier'>NaiveBayesClassifier</seealso>
       <example>
         <code>
           pipeline.Add(OneVersusAll.With(new StochasticDualCoordinateAscentBinaryClassifier()));
@@ -56,4 +56,4 @@
     </member>
    
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index aa5ecb67a5..371a5bde58 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -28,7 +28,7 @@ namespace Microsoft.ML.Runtime.Learners
     //     - Loss function. By default, hinge loss (aka max-margin avgd perceptron)
     //     - Feature normalization. By default, rescaling between min and max values for every feature
     //     - Prediction calibration to produce probabilities. Off by default, if on, uses exponential (aka Platt) calibration.
-    /// <include file='./doc.xml' path='docs/members/member[@name="AP"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="AP"]/*' />
     public sealed class AveragedPerceptronTrainer :
         AveragedLinearTrainer<AveragedPerceptronTrainer.Arguments, LinearBinaryPredictor>
     {
@@ -94,7 +94,7 @@ public override LinearBinaryPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name=""AP""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""AP""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
index f345466e19..4e6bd89b8f 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
@@ -27,7 +27,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TPredictor = LinearRegressionPredictor;
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="OGD"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="OGD"]/*' />
     public sealed class OnlineGradientDescentTrainer : AveragedLinearTrainer<OnlineGradientDescentTrainer.Arguments, TPredictor>
     {
         internal const string LoadNameValue = "OnlineGradientDescent";
@@ -94,7 +94,7 @@ public override TPredictor CreatePredictor()
             Desc = "Train a Online gradient descent perceptron.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name=""OGD""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""OGD""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
index 1ab7647c4f..c6cd70789f 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
     
     <member name="OGD">
@@ -41,4 +41,4 @@
     </member>
 
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
index 9322c2cc75..356c06b07d 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="PoissonRegression"]/*' />
     public sealed class PoissonRegression : LbfgsTrainerBase<Float, PoissonRegressionPredictor>
     {
         internal const string LoadNameValue = "PoissonRegression";
@@ -129,7 +129,7 @@ protected override void ProcessPriorDistribution(Float label, Float weight)
             Desc = "Train an Poisson regression model.", 
             UserName = UserNameValue, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='docs/members/member[@name=""PoissonRegression""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/member[@name=""PoissonRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
index 4d2aeec579..9e7f4eee38 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
    
     <member name="PoissonRegression">
@@ -14,4 +14,4 @@
     </member>
    
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
index 20bc349a7c..a256f7ae79 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
@@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.Learners
     using TVectorPredictor = IPredictorProducing<VBuffer<Float>>;
 
     // SDCA linear multiclass trainer.
-    /// <include file='./doc.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="SDCA"]/*' />
     public class SdcaMultiClassTrainer : SdcaTrainerBase<TVectorPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAMC";
@@ -389,7 +389,7 @@ public static partial class Sdca
             Desc = SdcaMultiClassTrainer.Summary,
             UserName = SdcaMultiClassTrainer.UserNameValue,
             ShortName = SdcaMultiClassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
index 512818bba7..554603a7d9 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
@@ -25,7 +25,7 @@ namespace Microsoft.ML.Runtime.Learners
 {
     using TScalarPredictor = IPredictorWithFeatureWeights<Float>;
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="SDCA"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="SDCA"]/*' />
     public sealed class SdcaRegressionTrainer : SdcaTrainerBase<IPredictor>, ITrainer<RoleMappedData, TScalarPredictor>, ITrainerEx
     {
         public const string LoadNameValue = "SDCAR";
@@ -136,7 +136,7 @@ public static partial class Sdca
             Desc = SdcaRegressionTrainer.Summary,
             UserName = SdcaRegressionTrainer.UserNameValue,
             ShortName = SdcaRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
index 0b4336a96e..e435fc154d 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
     
     <member name="SDCA">
@@ -27,4 +27,4 @@
     </member>
  
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
index f400d92a93..42d1f4d310 100644
--- a/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalHashTransform.cs
@@ -19,7 +19,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
     public static class CategoricalHashTransform
     {
         public const int NumBitsLim = 31; // can't convert 31-bit hashes to indicator vectors, so max is 30
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index fc1382901b..40dd657af7 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -21,7 +21,7 @@
 [assembly: LoadableClass(typeof(void), typeof(Categorical), null, typeof(SignatureEntryPointModule), "Categorical")]
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="CategoricalOneHotVectorizer"]/*' />
     public static class CategoricalTransform
     {
         public enum OutputKind : byte
@@ -246,7 +246,7 @@ public static class Categorical
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
             Desc = CategoricalTransform.Summary,
             UserName = CategoricalTransform.UserName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -261,7 +261,7 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
             Desc = CategoricalHashTransform.Summary,
             UserName = CategoricalHashTransform.UserName ,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -273,7 +273,10 @@ public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment en
             return new CommonOutputs.TransformOutput { Model = new TransformModel(env, xf, input.Data), OutputData = xf };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.TextToKeyConverter", Desc = TermTransform.Summary, UserName = TermTransform.UserName)]
+        [TlcModule.EntryPoint(Name = "Transforms.TextToKeyConverter",
+            Desc = TermTransform.Summary, 
+            UserName = TermTransform.UserName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""TextToKey""]/*' />" })]
         public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, TermTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -285,7 +288,10 @@ public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, Term
             return new CommonOutputs.TransformOutput { Model = new TransformModel(env, xf, input.Data), OutputData = xf };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.KeyToTextConverter", Desc = "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", UserName = KeyToValueTransform.UserName)]
+        [TlcModule.EntryPoint(Name = "Transforms.KeyToTextConverter", 
+            Desc = "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", 
+            UserName = KeyToValueTransform.UserName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""KeyToText""]/*' />" })]
         public static CommonOutputs.TransformOutput KeyToText(IHostEnvironment env, KeyToValueTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
index d6de490d52..8eba9a2743 100644
--- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
+++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
@@ -17,11 +17,15 @@ namespace Microsoft.ML.Runtime.Transforms
     /// </summary>
     public static class TextAnalytics
     {
-        [TlcModule.EntryPoint(Name = "Transforms.TextFeaturizer", Desc = Data.TextTransform.Summary, UserName = Data.TextTransform.UserName, ShortName = Data.TextTransform.LoaderSignature)]
+        [TlcModule.EntryPoint(Name = "Transforms.TextFeaturizer", 
+            Desc = Data.TextTransform.Summary, 
+            UserName = Data.TextTransform.UserName, 
+            ShortName = Data.TextTransform.LoaderSignature,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""TextTransform""]/*' />" })]
         public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "TextTransform", input);
-            var xf = Microsoft.ML.Runtime.Data.TextTransform.Create(h, input, input.Data);
+            var xf = Data.TextTransform.Create(h, input, input.Data);
             return new CommonOutputs.TransformOutput()
             {
                 Model = new TransformModel(h, xf, input.Data),
@@ -29,8 +33,11 @@ public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env,
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.WordTokenizer", Desc = Data.DelimitedTokenizeTransform.Summary,
-            UserName = Data.DelimitedTokenizeTransform.UserName, ShortName = Data.DelimitedTokenizeTransform.LoaderSignature)]
+        [TlcModule.EntryPoint(Name = "Transforms.WordTokenizer", 
+            Desc = Data.DelimitedTokenizeTransform.Summary,
+            UserName = Data.DelimitedTokenizeTransform.UserName, 
+            ShortName = Data.DelimitedTokenizeTransform.LoaderSignature,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordTokenizer""]/*' />" })]
         public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, DelimitedTokenizeTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "DelimitedTokenizeTransform", input);
@@ -42,7 +49,11 @@ public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvi
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.NGramTranslator", Desc = Data.NgramTransform.Summary, UserName = Data.NgramTransform.UserName, ShortName = Data.NgramTransform.LoaderSignature)]
+        [TlcModule.EntryPoint(Name = "Transforms.NGramTranslator", 
+            Desc = NgramTransform.Summary, 
+            UserName = NgramTransform.UserName, 
+            ShortName = NgramTransform.LoaderSignature,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""NgramTranslator""]/*' />" })]
         public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, NgramTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NGramTransform", input);
@@ -54,7 +65,10 @@ public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env,
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.Dictionarizer", Desc = Data.TermTransform.Summary, UserName = Data.TermTransform.UserName, ShortName = Data.TermTransform.LoaderSignature)]
+        [TlcModule.EntryPoint(Name = "Transforms.Dictionarizer", 
+            Desc = Data.TermTransform.Summary, 
+            UserName = Data.TermTransform.UserName, 
+            ShortName = Data.TermTransform.LoaderSignature)]
         public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, TermTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input);
@@ -66,7 +80,11 @@ public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env,
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.SentimentAnalyzer", Desc = "Uses a pretrained sentiment model to score input strings", UserName = SentimentAnalyzingTransform.UserName, ShortName = SentimentAnalyzingTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.SentimentAnalyzer", 
+            Desc = "Uses a pretrained sentiment model to score input strings", 
+            UserName = SentimentAnalyzingTransform.UserName, 
+            ShortName = SentimentAnalyzingTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""SentimentAnalyzer""]/*' />" })]
         public static CommonOutputs.TransformOutput AnalyzeSentiment(IHostEnvironment env, SentimentAnalyzingTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "SentimentAnalyzer", input);
@@ -78,7 +96,11 @@ public static CommonOutputs.TransformOutput AnalyzeSentiment(IHostEnvironment en
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.CharacterTokenizer", Desc = CharTokenizeTransform.Summary, UserName = CharTokenizeTransform.UserName, ShortName = CharTokenizeTransform.LoaderSignature)]
+        [TlcModule.EntryPoint(Name = "Transforms.CharacterTokenizer", 
+            Desc = CharTokenizeTransform.Summary, 
+            UserName = CharTokenizeTransform.UserName, 
+            ShortName = CharTokenizeTransform.LoaderSignature,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""CharacterTokenizer""]/*' />" })]
         public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, CharTokenizeTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -93,7 +115,11 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, C
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.LightLda", Desc = LdaTransform.Summary, UserName = LdaTransform.UserName, ShortName = LdaTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.LightLda", 
+            Desc = LdaTransform.Summary, 
+            UserName = LdaTransform.UserName, 
+            ShortName = LdaTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""LightLDA""]/*' />" })]
         public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs
index a7a69ff1f4..fd67e5fca6 100644
--- a/src/Microsoft.ML.Transforms/GcnTransform.cs
+++ b/src/Microsoft.ML.Transforms/GcnTransform.cs
@@ -38,7 +38,7 @@ namespace Microsoft.ML.Runtime.Data
     ///    Performs the following operation on a vector X:
     ///         Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.
     ///    Usage examples and Matlab code:
-    ///    <see href="http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf"/>
+    ///    <a href="http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf">http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf</a>.
     /// </summary>
     public sealed class LpNormNormalizerTransform : OneToOneTransformBase
     {
@@ -666,7 +666,11 @@ private static Float Mean(Float[] src, int count, int length)
 
     public static class LpNormalization
     {
-        [TlcModule.EntryPoint(Name = "Transforms.LpNormalizer", Desc = LpNormNormalizerTransform.Summary, UserName = LpNormNormalizerTransform.UserNameLP, ShortName = LpNormNormalizerTransform.ShortNameLP)]
+        [TlcModule.EntryPoint(Name = "Transforms.LpNormalizer", 
+            Desc = LpNormNormalizerTransform.Summary, 
+            UserName = LpNormNormalizerTransform.UserNameLP, 
+            ShortName = LpNormNormalizerTransform.ShortNameLP,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""LpNormalize""]/*' />" })]
         public static CommonOutputs.TransformOutput Normalize(IHostEnvironment env, LpNormNormalizerTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LpNormalize", input);
@@ -678,7 +682,11 @@ public static CommonOutputs.TransformOutput Normalize(IHostEnvironment env, LpNo
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.GlobalContrastNormalizer", Desc = LpNormNormalizerTransform.GcnSummary, UserName = LpNormNormalizerTransform.UserNameGn, ShortName = LpNormNormalizerTransform.ShortNameGn)]
+        [TlcModule.EntryPoint(Name = "Transforms.GlobalContrastNormalizer", 
+            Desc = LpNormNormalizerTransform.GcnSummary, 
+            UserName = LpNormNormalizerTransform.UserNameGn, 
+            ShortName = LpNormNormalizerTransform.ShortNameGn,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""GcNormalize""]/*' />" })]
         public static CommonOutputs.TransformOutput GcNormalize(IHostEnvironment env, LpNormNormalizerTransform.GcnArguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "GcNormalize", input);
diff --git a/src/Microsoft.ML.Transforms/GroupTransform.cs b/src/Microsoft.ML.Transforms/GroupTransform.cs
index 1d5b823278..e19167c55a 100644
--- a/src/Microsoft.ML.Transforms/GroupTransform.cs
+++ b/src/Microsoft.ML.Transforms/GroupTransform.cs
@@ -652,7 +652,11 @@ public ValueGetter<TValue> GetGetter<TValue>(int col)
 
     public static partial class GroupingOperations
     {
-        [TlcModule.EntryPoint(Name = "Transforms.CombinerByContiguousGroupId", Desc = GroupTransform.Summary, UserName = GroupTransform.UserName, ShortName = GroupTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.CombinerByContiguousGroupId", 
+            Desc = GroupTransform.Summary, 
+            UserName = GroupTransform.UserName, 
+            ShortName = GroupTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""Group""]/*' />" })]
         public static CommonOutputs.TransformOutput Group(IHostEnvironment env, GroupTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/HashJoinTransform.cs b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
index 25adc7858f..f5fb3a6f71 100644
--- a/src/Microsoft.ML.Transforms/HashJoinTransform.cs
+++ b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
@@ -166,7 +166,7 @@ private static VersionInfo GetVersionInfo()
 
         private readonly ColumnInfoEx[] _exes;
 
-        /// <include file='doc.xml' path='docs/members/member[@name="HashJoin"]/*' />
+        /// <include file='doc.xml' path='doc/members/member[@name="HashJoin"]/*' />
         public HashJoinTransform(IHostEnvironment env, Arguments args, IDataView input)
             : base(env, RegistrationName, Contracts.CheckRef(args, nameof(args)).Column, input, TestColumnType)
         {
@@ -679,7 +679,7 @@ public static class HashJoin
             Desc = HashJoinTransform.Summary, 
             UserName = HashJoinTransform.UserName, 
             ShortName = HashJoinTransform.RegistrationName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""HashJoin""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""HashJoin""]/*' />" })]
         public static CommonOutputs.TransformOutput Apply(IHostEnvironment env, HashJoinTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/NADropTransform.cs b/src/Microsoft.ML.Transforms/NADropTransform.cs
index a87386e12a..119e868913 100644
--- a/src/Microsoft.ML.Transforms/NADropTransform.cs
+++ b/src/Microsoft.ML.Transforms/NADropTransform.cs
@@ -21,7 +21,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='doc.xml' path='docs/members/member[@name="NADrop"]'/>
+    /// <include file='doc.xml' path='doc/members/member[@name="NADrop"]/*' />
     public sealed class NADropTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
diff --git a/src/Microsoft.ML.Transforms/NAHandling.cs b/src/Microsoft.ML.Transforms/NAHandling.cs
index 3ab3e7816d..992b2998d0 100644
--- a/src/Microsoft.ML.Transforms/NAHandling.cs
+++ b/src/Microsoft.ML.Transforms/NAHandling.cs
@@ -15,7 +15,7 @@ public static class NAHandling
             Desc = NADropTransform.Summary,
             UserName = NADropTransform.FriendlyName, 
             ShortName = NADropTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""NADrop""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NADrop""]/*' />" })]
         public static CommonOutputs.TransformOutput Drop(IHostEnvironment env, NADropTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, NADropTransform.ShortName, input);
@@ -63,7 +63,7 @@ public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandl
             Desc = NAIndicatorTransform.Summary, 
             UserName = NAIndicatorTransform.FriendlyName, 
             ShortName = NAIndicatorTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""NAIndicator""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAIndicator""]/*' />" })]
         public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIndicatorTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAIndicator", input);
@@ -75,7 +75,11 @@ public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIn
             };
         }
 
-        [TlcModule.EntryPoint(Name = "Transforms.MissingValueSubstitutor", Desc = NAReplaceTransform.Summary, UserName = NAReplaceTransform.FriendlyName, ShortName = NAReplaceTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.MissingValueSubstitutor", 
+            Desc = NAReplaceTransform.Summary, 
+            UserName = NAReplaceTransform.FriendlyName, 
+            ShortName = NAReplaceTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAReplace""]/*' />" })]
         public static CommonOutputs.TransformOutput Replace(IHostEnvironment env, NAReplaceTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAReplace", input);
diff --git a/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs b/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
index 82d79203dd..18029f6cb4 100644
--- a/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
+++ b/src/Microsoft.ML.Transforms/NAIndicatorTransform.cs
@@ -21,7 +21,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <include file='doc.xml' path='docs/members/member[@name="NAIndicator"]'/>
+    /// <include file='doc.xml' path='doc/members/member[@name="NAIndicator"]/*' />
     public sealed class NAIndicatorTransform : OneToOneTransformBase
     {
         public sealed class Column : OneToOneColumn
diff --git a/src/Microsoft.ML.Transforms/NAReplaceTransform.cs b/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
index 44832ee517..367e9fe491 100644
--- a/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
+++ b/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
@@ -27,13 +27,14 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// This transform can transform either scalars or vectors (both fixed and variable size),
-    /// creating output columns that are identical to the input columns except for replacing NA values
-    /// with either the default value, user input, or imputed values (min/max/mean are currently supported).
-    /// Imputation modes are supported for vectors both by slot and across all slots.
-    /// </summary>
-    /// REVIEW: May make sense to implement the transform template interface.
+    // <summary>
+    // This transform can transform either scalars or vectors (both fixed and variable size),
+    // creating output columns that are identical to the input columns except for replacing NA values
+    // with either the default value, user input, or imputed values (min/max/mean are currently supported).
+    // Imputation modes are supported for vectors both by slot and across all slots.
+    // </summary>
+    // REVIEW: May make sense to implement the transform template interface.
+    /// <include file='doc.xml' path='doc/members/member[@name="NAReplace"]/*' />
     public sealed partial class NAReplaceTransform : OneToOneTransformBase
     {
         public enum ReplacementKind
diff --git a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
index dc1cda3ace..d03245c35e 100644
--- a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
+++ b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
@@ -26,7 +26,7 @@
 
 namespace Microsoft.ML.Runtime.DataPipe
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="OptionalColumnTransform"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="OptionalColumnTransform"]/*' />
     public class OptionalColumnTransform : RowToRowMapperTransformBase
     {
         public sealed class Arguments : TransformInputBase
@@ -464,7 +464,7 @@ private Delegate MakeGetterVec<T>(int length)
             Name = "Transforms.OptionalColumnCreator", 
             UserName = UserName, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name=""OptionalColumnTransform""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""OptionalColumnTransform""]/*' />" })]
 
         public static CommonOutputs.TransformOutput MakeOptional(IHostEnvironment env, Arguments input)
         {
diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs
index 588e0a86f1..7a9b214063 100644
--- a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs
+++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs
@@ -26,23 +26,22 @@
 
 namespace Microsoft.ML.Runtime.TextAnalytics
 {
-    /// <summary>
-    /// LightLDA transform: Big Topic Models on Modest Compute Clusters.
-    /// <see href="http://arxiv.org/abs/1412.1576">LightLDA</see> is an implementation of Latent Dirichlet Allocation (LDA).
-    /// Previous implementations of LDA such as SparseLDA or AliasLDA allow to achieve massive data and model scales,
-    /// for example models with tens of billions of parameters to be inferred from billions of documents.
-    /// However this requires using a cluster of thousands of machines with all ensuing costs to setup and maintain.
-    /// LightLDA solves this problem in a more cost-effective manner by providing an implementation 
-    /// that is efﬁcient enough for modest clusters with at most tens of machines... 
-    /// For more details please see original LightLDA paper: 
-    /// http://arxiv.org/abs/1412.1576
-    /// http://www.www2015.it/documents/proceedings/proceedings/p1351.pdf
-    /// and open source implementation: 
-    /// https://github.com/Microsoft/LightLDA
-    /// 
-    /// See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs"/>
-    /// for an example on how to use LdaTransform.
-    /// </summary>
+    // LightLDA transform: Big Topic Models on Modest Compute Clusters.
+    // <a href="http://arxiv.org/abs/1412.1576">LightLDA</a> is an implementation of Latent Dirichlet Allocation (LDA).
+    // Previous implementations of LDA such as SparseLDA or AliasLDA allow to achieve massive data and model scales,
+    // for example models with tens of billions of parameters to be inferred from billions of documents.
+    // However this requires using a cluster of thousands of machines with all ensuing costs to setup and maintain.
+    // LightLDA solves this problem in a more cost-effective manner by providing an implementation 
+    // that is efficient enough for modest clusters with at most tens of machines...
+    // For more details please see original LightLDA paper: 
+    // http://arxiv.org/abs/1412.1576
+    // http://www.www2015.it/documents/proceedings/proceedings/p1351.pdf
+    // and open source implementation: 
+    // https://github.com/Microsoft/LightLDA
+    // 
+    // See <a href="https://github.com/dotnet/machinelearning/blob/master/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs"/>
+    // for an example on how to use LdaTransform.
+    /// <include file='doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
     public sealed class LdaTransform : OneToOneTransformBase
     {
         public sealed class Arguments : TransformInputBase
diff --git a/src/Microsoft.ML.Transforms/Text/SentimentAnalyzerTransform.cs b/src/Microsoft.ML.Transforms/Text/SentimentAnalyzerTransform.cs
index 7ff3d84d10..f3471e09e0 100644
--- a/src/Microsoft.ML.Transforms/Text/SentimentAnalyzerTransform.cs
+++ b/src/Microsoft.ML.Transforms/Text/SentimentAnalyzerTransform.cs
@@ -17,6 +17,7 @@
 
 namespace Microsoft.ML.Runtime.TextAnalytics
 {
+    /// <include file='doc.xml' path='doc/members/member[@name="SentimentAnalyzer"]/*' />
     public static class SentimentAnalyzingTransform
     {
         public sealed class Arguments : TransformInputBase
diff --git a/src/Microsoft.ML.Transforms/Text/TextTransform.cs b/src/Microsoft.ML.Transforms/Text/TextTransform.cs
index b2a34a1126..932bf63272 100644
--- a/src/Microsoft.ML.Transforms/Text/TextTransform.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextTransform.cs
@@ -24,11 +24,10 @@ namespace Microsoft.ML.Runtime.Data
     using StopWordsLang = StopWordsRemoverTransform.Language;
     using CaseNormalizationMode = TextNormalizerTransform.CaseNormalizationMode;
 
-    /// <summary>
-    /// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts 
-    /// of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
-    /// integer index mapping through hashing) as an option.
-    /// </summary>
+    // A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts 
+    // of (word or character) ngrams in a given text. It offers ngram hashing (finding the ngram token string name to feature
+    // integer index mapping through hashing) as an option.
+    /// <include file='doc.xml' path='doc/members/member[@name="TextTransform"]/*' />
     public static class TextTransform
     {
         /// <summary>
diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs
index 60500d33cc..5afd177763 100644
--- a/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs
+++ b/src/Microsoft.ML.Transforms/Text/WordTokenizeTransform.cs
@@ -35,11 +35,10 @@ public interface ITokenizeTransform : IDataTransform
     {
     }
 
-    /// <summary>
-    /// The input for this transform is a DvText or a vector of DvTexts, and its output is a vector of DvTexts,
-    /// corresponding to the tokens in the input text, split using a set of user specified separator characters.
-    /// Empty strings and strings containing only spaces are dropped.
-    /// </summary>
+    // The input for this transform is a DvText or a vector of DvTexts, and its output is a vector of DvTexts,
+    // corresponding to the tokens in the input text, split using a set of user specified separator characters.
+    // Empty strings and strings containing only spaces are dropped.
+    /// <include file='doc.xml' path='doc/members/member[@name="WordTokenizer"]/*' />
     public sealed class DelimitedTokenizeTransform : OneToOneTransformBase, ITokenizeTransform
     {
         public class Column : OneToOneColumn
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
new file mode 100644
index 0000000000..5e4ffead3f
--- /dev/null
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -0,0 +1,185 @@
+﻿<?xml version="1.0" encoding="utf-8" ?>
+<doc>
+  <members>
+
+    <member name="TextTransform">
+      <summary>
+        A transform that turns a collection of text documents into numerical feature vectors.
+        The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.
+      </summary>
+      <remarks>
+        The TextFeaturizer transform gives the user a one-stop solution for:
+        <list type="bullet">
+          <item>
+            <description>Language Detection</description>
+          </item>
+          <item>
+            <description>Tokenization</description>
+          </item>
+          <item>
+            <description>Text normalization</description>
+          </item>
+          <item>
+            <description>Predefined and custom stopwords removal.</description>
+          </item>
+          <item>
+            <description>Word-based or character-based Ngram and SkipGram extraction.</description>
+          </item>
+          <item>
+            <description>TF, IDF or TF-IDF.</description>
+          </item>
+          <item>
+            <description>L-p vector normalization.</description>
+          </item>
+        </list>
+        The TextFeaturizer will show the transformed text, after being applied.
+        It converts a collection of text columns to a matrix of token n-gram/skip-gram counts.
+        Features are made of (word/character) n-grams/skip-grams, and the number of features is equal to the vocabulary size found by analyzing the data.
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new TextFeaturizer(&quot;Features&quot;, &quot;SentimentText&quot;)
+          {
+            KeepDiacritics = false,
+            KeepPunctuations = false,
+            TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
+            OutputTokens = true,
+            StopWordsRemover = new PredefinedStopWordsRemover(),
+            VectorNormalizer = TextTransformTextNormKind.L2,
+            CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
+            WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
+          });
+        </code>
+      </example>
+    </member>
+
+    <member name="WordTokenizer">
+      <summary>
+        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. 
+        The separator is space, but can be specified as any other character (or multiple characters) if needed.
+      </summary>
+      <remarks>
+        The input for this transform is a <see cref="Microsoft.ML.Runtime.Data.DvText">DvText</see> or a vector of <see cref="Microsoft.ML.Runtime.Data.DvText">DvTexts</see>,
+        and its output is a vector of DvTexts, corresponding to the tokens in the input text.
+        The output is generated by splitting the input text, using a set of user specified separator characters.
+        Empty strings and strings containing only spaces are dropped.
+        This transform is not typically used on its own, but it is one of the transforms composing the Text Featurizer.
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add( new WordTokenizer(&quot;TextColumn&quot;){ TermSeparators = &quot;&apos; &apos;, &apos;\t&apos;, &apos;;&apos;&quot;  } );
+        </code>
+      </example>
+    </member>
+
+    <member name="NgramTranslator">
+      <summary>
+        This transform produces a bag of counts of n-grams (sequences of consecutive values of length 1-n) in a given vector of keys. 
+        It does so by building a dictionary of n-grams and using the id in the dictionary as the index in the bag.
+      </summary>
+      <remarks>
+        This transform produces a matrix of token ngrams/skip-grams counts for a given corpus of text.
+        The n-grams are represented as count vectors, with vector slots corresponding to n-grams.
+        Embedding ngrams in a vector space allows their contents to be compared in an efficient manner. 
+        The slot values in the vector can be weighted by the following factors:
+        <list type="bullet">
+          <item>
+            <description>term frequency - The number of occurrences of the slot in the text</description>
+          </item>
+          <item>
+            <description>
+              inverse document frequency - A ratio (the logarithm of inverse relative slot frequency)
+              that measures the information a slot provides by determining how common or rare it is across the entire text.
+            </description>
+          </item>
+          <item>
+            <description>term frequency-inverse document frequency - the product of the term frequency and the inverse document frequency.</description>
+          </item>
+        </list>
+        This transform is not typically used on its own, but it is one of the transforms composing the <see cref="Microsoft.ML.Transforms.TextFeaturizer">Text Featurizer</see>.
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.WordTokenizer"/>
+      <seealso cref="Microsoft.ML.Transforms.TextToKey"/>
+      <seealso cref="Microsoft.ML.Transforms.TextFeaturizer"/>
+      <seealso cref="Microsoft.ML.Transforms.CharacterTokenizer"/>
+      <example>
+        <code>
+          pipeline.Add(new NGramTranslator(&quot;TextColumn&quot;){ Weighting=NgramTransformWeightingCriteria.TfIdf  } );
+      </code>
+      </example>
+    </member>
+
+    <member name="SentimentAnalyzer">
+      <summary>
+        Uses a pretrained sentiment model to score input strings.
+      </summary>
+      <remarks>
+        <para>The Sentiment transform returns the probability that the sentiment of a natural text is positive. </para>
+        <para>
+          The model was trained with the <a href="http://anthology.aclweb.org/P/P14/P14-1146.pdf">Sentiment-specific word embedding (SSWE)</a>  and NGramFeaturizer on Twitter sentiment data,
+          similarly to the sentiment analysis part of the
+          <a href="https://www.microsoft.com/cognitive-services/en-us/text-analytics-api">Text Analytics cognitive service</a>. 
+          The transform outputs a score between 0 and 1 as a sentiment prediction 
+          (where 0 is a negative sentiment and 1 is a positive sentiment).</para> 
+          <para>Currently it supports only English.</para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new SentimentAnalyzer(){ Source = &quot;TextColumn&quot; }  );
+        </code>
+      </example>
+    </member>
+
+    <member name="CharacterTokenizer">
+      <summary>
+        Character-oriented tokenizer where text is considered a sequence of characters. 
+      </summary>
+      <remarks>
+      This transform is not typically used on its own, but it is one of the transforms composing the 
+      <see cref="Microsoft.ML.Transforms.TextFeaturizer">Text Featurizer</see>. 
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.WordTokenizer"/>
+      <seealso cref="Microsoft.ML.Transforms.TextToKey"/>
+      <seealso cref="Microsoft.ML.Transforms.NGramTranslator"/>
+      <seealso cref="Microsoft.ML.Transforms.TextFeaturizer"/>
+      <example>
+        <code>
+          pipeline.Add(new CharacterTokenizer("TextCol1" , "TextCol2" ) );
+        </code>
+      </example>
+    </member>
+
+    <member name="LightLDA">
+      <summary>
+        The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.
+      </summary>
+      <remarks>
+        Latent Dirichlet Allocation is a well-known topic modeling algorithm that infers topical structure from text data,
+        and can be used to featurize any text fields as low-dimensional topical vectors. 
+        <para>LightLDA is an extremely efficient implementation of LDA developed in MSR-Asia that incorporates a number of 
+         optimization techniques. See <a href="http://arxiv.org/abs/1412.1576">LightLDA: Big Topic Models on Modest Compute Clusters</a>.
+        </para>
+        <para>
+          With the LDA transform, ML.NET users can train a topic model to produce 1 million topics with 1 million vocabulary
+          on a 1-billion-token document set on a single machine in a few hours (typically, LDA at this scale takes days and requires large clusters).
+          The most significant innovation is a super-efficient O(1) <a href="https://en.wikipedia.org/wiki/Metropolis–Hastings_algorithm">Metropolis-Hastings sampling algorithm</a>,
+          whose running cost is (surprisingly) agnostic of model size,
+          allowing it to converge nearly an order of magnitude faster than other <a href="https://en.wikipedia.org/wiki/Gibbs_sampling">Gibbs samplers</a>.
+        </para>
+        <para>
+          For more details please see original LightLDA paper, and its open source implementation. 
+          <list type="bullet">
+            <item><description><a href="http://arxiv.org/abs/1412.1576">LightLDA: Big Topic Models on Modest Compute Clusters</a></description></item>
+            <item><description><a href="https://github.com/Microsoft/LightLDA">LightLDA</a></description></item>
+          </list>
+        </para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
+        </code>
+      </example>
+    </member>
+
+  </members>
+</doc>
diff --git a/src/Microsoft.ML.Transforms/UngroupTransform.cs b/src/Microsoft.ML.Transforms/UngroupTransform.cs
index cb97e1c3b3..387320da81 100644
--- a/src/Microsoft.ML.Transforms/UngroupTransform.cs
+++ b/src/Microsoft.ML.Transforms/UngroupTransform.cs
@@ -22,28 +22,28 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// This can be thought of as an inverse of <see cref="GroupTransform"/>. For all specified vector columns 
-    /// ("pivot" columns), performs the "ungroup" (or "unroll") operation as outlined below.
-    /// 
-    /// If the only pivot column is called P, and has size K, then for every row of the input we will produce 
-    /// K rows, that are identical in all columns except P. The column P will become a scalar column, and this 
-    /// column will hold all the original values of input's P, one value per row, in order. The order of columns 
-    /// will remain the same.
-    /// 
-    /// Variable-length pivot columns are supported (including zero, which will eliminate the row from the result).
-    /// 
-    /// Multiple pivot columns are also supported:
-    /// * A number of output rows is controlled by the 'mode' parameter. 
-    ///     - outer: it is equal to the maximum length of pivot columns,
-    ///     - inner: it is equal to the minimum length of pivot columns,
-    ///     - first: it is equal to the length of the first pivot column.
-    /// * If a particular pivot column has size that is different than the number of output rows, the extra slots will
-    /// be ignored, and the missing slots will be 'padded' with default values.
-    /// 
-    /// All metadata is preserved for the retained columns. For 'unrolled' columns, all known metadata
-    /// except slot names is preserved.
-    /// </summary>
+
+    // This can be thought of as an inverse of GroupTransform. For all specified vector columns 
+    // ("pivot" columns), performs the "ungroup" (or "unroll") operation as outlined below.
+    // 
+    // If the only pivot column is called P, and has size K, then for every row of the input we will produce 
+    // K rows, that are identical in all columns except P. The column P will become a scalar column, and this 
+    // column will hold all the original values of input's P, one value per row, in order. The order of columns 
+    // will remain the same.
+    // 
+    // Variable-length pivot columns are supported (including zero, which will eliminate the row from the result).
+    // 
+    // Multiple pivot columns are also supported:
+    // * A number of output rows is controlled by the 'mode' parameter. 
+    //     - outer: it is equal to the maximum length of pivot columns,
+    //     - inner: it is equal to the minimum length of pivot columns,
+    //     - first: it is equal to the length of the first pivot column.
+    // * If a particular pivot column has size that is different than the number of output rows, the extra slots will
+    // be ignored, and the missing slots will be 'padded' with default values.
+    // 
+    // All metadata is preserved for the retained columns. For 'unrolled' columns, all known metadata
+    // except slot names is preserved.
+    /// <include file='doc.xml' path='doc/members/member[@name="Ungroup"]/*' />
     public sealed class UngroupTransform : TransformBase
     {
         public const string Summary = "Un-groups vector columns into sequences of rows, inverse of Group transform";
@@ -627,7 +627,11 @@ private ValueGetter<T> MakeGetter<T>(int col, PrimitiveType itemType)
 
     public static partial class GroupingOperations
     {
-        [TlcModule.EntryPoint(Name = "Transforms.Segregator", Desc = UngroupTransform.Summary, UserName = UngroupTransform.UserName, ShortName = UngroupTransform.ShortName)]
+        [TlcModule.EntryPoint(Name = "Transforms.Segregator", 
+            Desc = UngroupTransform.Summary, 
+            UserName = UngroupTransform.UserName, 
+            ShortName = UngroupTransform.ShortName,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""Ungroup""]/*' />" })]
         public static CommonOutputs.TransformOutput Ungroup(IHostEnvironment env, UngroupTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/WhiteningTransform.cs b/src/Microsoft.ML.Transforms/WhiteningTransform.cs
index 2ae3824ba8..8998eb0d19 100644
--- a/src/Microsoft.ML.Transforms/WhiteningTransform.cs
+++ b/src/Microsoft.ML.Transforms/WhiteningTransform.cs
@@ -42,7 +42,7 @@ public enum WhiteningKind
     /// That is, PCA whitening is essentially just a PCA + rescale.
     /// ZCA whitening tries to make resulting data to look more like input data by rotating it back to the 
     /// original input space.
-    /// More information: <see href="http://ufldl.stanford.edu/wiki/index.php/Whitening"/>
+    /// More information: <a href="http://ufldl.stanford.edu/wiki/index.php/Whitening">http://ufldl.stanford.edu/wiki/index.php/Whitening</a>
     /// </summary>
     public sealed class WhiteningTransform : OneToOneTransformBase
     {
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index 9fd37cd278..20e463ecd0 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<docs>
+<doc>
   <members>
 
     <member name="CategoricalHashOneHotVectorizer">
@@ -55,8 +55,8 @@
       <remarks>
         <para>
           This transform uses a set of aggregators to count the number of non-default values for each slot and
-          instantiates a <see cref="Microsoft.ML.Runtime.Data.DropSlotsTramsform"/> to actually drop the slots.
-          This transform is useful when applied together with a <see cref="Microsoft.ML.Transforms.CategoricalHashOneHotVectorizer"/>. 
+          instantiates a <see cref="Microsoft.ML.Runtime.Data.DropSlotsTransform"/> to actually drop the slots.
+          This transform is useful when applied together with a CategoricalHashOneHotVectorizer. 
           The count feature selection can remove those features generated by the hash transform that have no data in the examples.
         </para>
       </remarks>
@@ -164,7 +164,6 @@
       </example>
     </member>
 
-
     <member name="NAIndicator">
       <summary>
         This transform can transform either scalars or vectors (both fixed and variable size),
@@ -177,6 +176,174 @@
         </code>
       </example>
     </member>
-  
+    
+    <member name="NAReplace">
+      <summary>
+        Create an output column of the same type and size as the input column,
+        where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).
+      </summary>
+      <remarks>
+        This transform can transform either scalars or vectors (both fixed and variable size),
+        creating output columns that are identical to the input columns except for replacing NA values
+        with either the default value, user input, or imputed values (min/max/mean are currently supported).
+        Imputation modes are supported for vectors both by slot and across all slots.
+      </remarks>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new MissingValueSubstitutor(&quot;FeatureCol&quot;){ ReplacementKind = NAReplaceTransformReplacementKind.Mean });
+        </code>
+      </example>
+    </member>
+    
+    <member name="LpNormalize">
+      <summary>
+         The LpNormalizer transform normalizes vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf).
+         <para>Performs the following operation on a vector X:</para> 
+         <para>Y = (X - M) / D</para> 
+         <para>where M is mean and D is either L2 norm, L1 norm or LInf norm.</para>
+       </summary>
+      <remarks>
+        Scaling inputs to unit norms is a common operation for text classification or clustering.
+        For more information see: <a href="http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf">http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf</a>
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.GlobalContrastNormalizer"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new LpNormalizer("FeatureCol"){ NormKind = LpNormNormalizerTransformNormalizerKind.L1Norm});
+        </code>
+      </example>
+    </member>
+    
+    <member name="GcNormalize">
+      <summary>
+        <para>Performs a global contrast normalization on input values:</para>
+        <para>Y = (s * X - M) / D</para> 
+        <para>where s is a scale, M is mean and D is either the L2 norm or standard deviation.</para>
+       </summary>
+      <remarks>
+        Scaling inputs to unit norms is a common operation for text classification or clustering.
+        For more information see: <a href="http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf">http://www.cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf</a>
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.LpNormalizer"></seealso>
+      <example>
+        <code>
+          pipeline.Add(new GlobalContrastNormalizer(&quot;FeatureCol&quot;){ SubMean= false });
+        </code>
+      </example>
+    </member>
+    
+    <member name="Ungroup">
+      <summary>
+        Un-groups vector columns into sequences of rows; the inverse of the Group transform.
+      </summary>
+      <remarks>
+        <para>This can be thought of as an inverse of the CombinerByContiguousGroupId. 
+        For all specified vector columns ("pivot" columns), performs the "ungroup" (or "unroll") operation as outlined below.
+        </para>
+        <para>If the only pivot column is called P, and has size K, then for every row of the input we will produce 
+         K rows, that are identical in all columns except P. The column P will become a scalar column, and this 
+         column will hold all the original values of input's P, one value per row, in order. The order of columns 
+         will remain the same.
+        </para>
+        <para>Variable-length pivot columns are supported (including zero, which will eliminate the row from the result).</para>
+        <para>Multiple pivot columns are also supported:</para>
+        <list type="bullet">
+          <item>
+            <description>
+              The number of output rows is controlled by the 'mode' parameter,
+              depending on its value:
+              <list type="bullet">
+                <item><description>outer: it is equal to the maximum length of pivot columns</description></item>
+                <item><description>inner: it is equal to the minimum length of pivot columns</description></item>
+                <item><description>first: it is equal to the length of the first pivot column</description></item>
+              </list>
+            </description>
+          </item>
+          <item>
+            <description>
+              If a particular pivot column has size that is different than the number of output rows, the extra slots will
+              be ignored, and the missing slots will be 'padded' with default values.
+            </description>
+          </item>
+        </list>
+        <para>All metadata is preserved for the retained columns. For 'unrolled' columns, all known metadata
+        except slot names is preserved.
+        </para>
+      </remarks>
+      <example>
+        <code>
+          pipeline.Add(new Segregator(){ Column = new[]{&quot;Column1&quot; }, Mode = UngroupTransformUngroupMode.First} );
+        </code>
+      </example>
+    </member>
+    
+    <member name="TextToKey">
+      <summary>
+       Converts input values (words, numbers, etc.) to index in a dictionary.
+      </summary>
+      <remarks>
+      The TextToKeyConverter transform builds up term vocabularies (dictionaries).
+      The TextToKeyConverter and the <see cref="Microsoft.ML.Transforms.HashConverter"/> are the two primary mechanisms by which raw input is transformed into keys.
+      If multiple columns are used, each column builds/uses exactly one vocabulary (dictionary).
+      The output columns are KeyType-valued.
+      The Key value is the one-based index of the item in the dictionary.
+      If the key is not found in the dictionary, it is assigned the missing value indicator.
+      This dictionary mapping values to keys is most commonly learnt from the unique values in input data, 
+      but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file.
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.HashConverter"/>
+      <seealso cref="Microsoft.ML.Transforms.KeyToTextConverter"/>
+      <example>
+        <code>
+          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;)){ Sort = TermTransformSortOrder.Occurrence });
+        </code>
+      </example>
+    </member>
+    
+    <member name="KeyToText">
+      <summary>
+       The KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the 
+       KeyValues metadata.
+      </summary>
+      <remarks>
+        The KeyToTextConverter is the complement of the <see cref="Microsoft.ML.Transforms.TextToKeyConverter"/> transform.
+        Since key values are an enumeration into the set of keys, most transforms that produce key valued outputs 
+        corresponding to input values will often, wherever possible, associate a piece of KeyValue metadata with that dataset.
+        Transforming values into a categorical variable would be of limited use, 
+        if we couldn't somehow backtrack to figure out what those categories actually mean. 
+        The KeyToTextConverter enables that functionality. 
+      </remarks>
+      <seealso cref="Microsoft.ML.Transforms.HashConverter"/>
+      <seealso cref="Microsoft.ML.Transforms.TextToKeyConverter"/>
+      <example>
+        <code>
+          pipeline.Add(new KeyToTextConverter((&quot;InColumn&quot;, &quot;OutColumn&quot; )));
+        </code>
+      </example>
+    </member>
+    
+    <member name="Group">
+      <summary>
+       Groups values of a scalar column into a vector, by a contiguous group ID.
+      </summary>
+      <remarks>
+       The CombinerByContiguousGroupId transform groups the consecutive rows that share the specified group key (or keys). 
+       Both group keys and the aggregated values can be of arbitrary non-vector types. 
+       The resulting data will have all the group key columns preserved, 
+       and the aggregated columns will become variable-length vectors of the original types.
+       <para>This transform essentially performs the following SQL-like operation:</para> 
+       <para>SELECT GroupKey1, GroupKey2, ... GroupKeyK, LIST(Value1), LIST(Value2), ... LIST(ValueN)</para>
+       <para>FROM Data</para> 
+       <para>GROUP BY GroupKey1, GroupKey2, ... GroupKeyK.</para> 
+      </remarks>
+       <seealso cref="Microsoft.ML.Transforms.Segregator"/>
+      <example>
+        <code>
+          pipeline.Add(new CombinerByContiguousGroupId(){ GroupKey = new []{"Key1", "Key2" } } );
+        </code>
+      </example>
+    </member>
+    
   </members>
-</docs>
\ No newline at end of file
+</doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index b190750ca1..73bc7ad988 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -3080,9 +3080,7 @@ public sealed partial class OneVersusAllMacroSubGraphOutput
 
         }
 
-        /// <summary>
-        /// One-vs-All macro (OVA)
-        /// </summary>
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name="OVA"]/*' />
         public sealed partial class OneVersusAll : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4095,7 +4093,7 @@ public sealed class Output
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name="AP"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4599,7 +4597,7 @@ public enum Bundle : byte
         }
 
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4890,7 +4888,7 @@ public FastForestBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastForest"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5177,7 +5175,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
         }
 
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5566,7 +5564,7 @@ public FastTreeBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5990,7 +5988,7 @@ public FastTreeRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6374,7 +6372,7 @@ public FastTreeRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTreeTweedieRegression"]/*' />
         public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6763,7 +6761,7 @@ public FastTreeTweedieRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='docs/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7190,7 +7188,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
         }
 
 
-        /// <include file='../Microsoft.ML.KMeansClustering/doc.xml' path='docs/members/member[@name="KMeans++"]/*' />
+        /// <include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/member[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7306,7 +7304,7 @@ public enum LightGbmArgumentsEvalMetricType
         }
 
 
-        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7509,7 +7507,7 @@ public LightGbmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7712,7 +7710,7 @@ public LightGbmClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7915,7 +7913,7 @@ public LightGbmRankerPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='docs/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8253,7 +8251,7 @@ public LinearSvmBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
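-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />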
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
@@ -8402,7 +8400,7 @@ public LogisticRegressionBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/member[@name="LBFGS"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
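-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name="LogisticRegressionClassifier"]/*' />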
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
@@ -8551,9 +8549,7 @@ public LogisticRegressionClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <summary>
-        /// Train a MultiClassNaiveBayesTrainer.
-        /// </summary>
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name="MultiClassNaiveBayesTrainer"]/*' />
         public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8624,7 +8620,7 @@ public NaiveBayesClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='docs/members/member[@name="OGD"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name="OGD"]/*' />
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8777,7 +8773,7 @@ public OnlineGradientDescentRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name="PCA"]/*' />
         public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8871,7 +8867,7 @@ public PcaAnomalyDetectorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='docs/members/member[@name="PoissonRegression"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/member[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9014,7 +9010,7 @@ public PoissonRegressorPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../../docs/code/xmlIncludes/Learners.xml' path='docs/members/member[@name="StochasticDualCoordinateAscentBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9153,7 +9149,7 @@ public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9276,7 +9272,7 @@ public StochasticDualCoordinateAscentClassifierPipelineStep(Output output)
     namespace Trainers
     {
 
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='docs/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9879,7 +9875,7 @@ public sealed partial class CategoricalHashTransformColumn : OneToOneColumn<Cate
 
         }
 
-        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
         public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10052,7 +10048,7 @@ public sealed partial class CategoricalTransformColumn : OneToOneColumn<Categori
 
         }
 
-        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='docs/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CategoricalOneHotVectorizer"]/*' />
         public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10194,9 +10190,7 @@ public sealed partial class CharTokenizeTransformColumn : OneToOneColumn<CharTok
 
         }
 
-        /// <summary>
-        /// Character-oriented tokenizer where text is considered a sequence of characters.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="CharacterTokenizer"]/*' />
         public sealed partial class CharacterTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10787,9 +10781,7 @@ public ColumnTypeConverterPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Groups values of a scalar column into a vector, by a contiguous group ID
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="Group"]/*' />
         public sealed partial class CombinerByContiguousGroupId : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11364,9 +11356,7 @@ public FeatureCombinerPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Selects the slots for which the count of non-default values is greater than or equal to a threshold.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CountFeatureSelection"]/*' />
         public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11434,9 +11424,7 @@ public FeatureSelectorByCountPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Selects the top k slots across all specified columns ordered by their mutual information with the label column.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
         public sealed partial class FeatureSelectorByMutualInformation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11543,9 +11531,7 @@ public sealed partial class LpNormNormalizerTransformGcnColumn : OneToOneColumn<
 
         }
 
-        /// <summary>
-        /// Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="GcNormalize"]/*' />
         public sealed partial class GlobalContrastNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11702,9 +11688,7 @@ public sealed partial class HashJoinTransformColumn : OneToOneColumn<HashJoinTra
 
         }
 
-        /// <summary>
-        /// Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="HashJoin"]/*' />
         public sealed partial class HashConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11841,9 +11825,7 @@ public sealed partial class KeyToValueTransformColumn : OneToOneColumn<KeyToValu
 
         }
 
-        /// <summary>
-        /// KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="KeyToText"]/*' />
         public sealed partial class KeyToTextConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -12279,9 +12261,7 @@ public sealed partial class LdaTransformColumn : OneToOneColumn<LdaTransformColu
 
         }
 
-        /// <summary>
-        /// The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
         public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -12616,9 +12596,7 @@ public sealed partial class LpNormNormalizerTransformColumn : OneToOneColumn<LpN
 
         }
 
-        /// <summary>
-        /// Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="LpNormalize"]/*' />
         public sealed partial class LpNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13035,9 +13013,7 @@ public sealed partial class NAHandleTransformColumn : OneToOneColumn<NAHandleTra
 
         }
 
-        /// <summary>
-        /// Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="NAFilter"]/*' />
         public sealed partial class MissingValueHandler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13169,9 +13145,7 @@ public sealed partial class NAIndicatorTransformColumn : OneToOneColumn<NAIndica
 
         }
 
-        /// <summary>
-        /// Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NAIndicator"]/*' />
         public sealed partial class MissingValueIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13288,9 +13262,7 @@ public sealed partial class NADropTransformColumn : OneToOneColumn<NADropTransfo
 
         }
 
-        /// <summary>
-        /// Removes NAs from vector columns.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NADrop"]/*' />
         public sealed partial class MissingValuesDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13393,9 +13365,7 @@ public MissingValuesDropperPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Filters out rows that contain missing values.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="NAFilter"]/*' />
         public sealed partial class MissingValuesRowDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13501,9 +13471,7 @@ public sealed partial class NAReplaceTransformColumn : OneToOneColumn<NAReplaceT
 
         }
 
-        /// <summary>
-        /// Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NAReplace"]/*' />
         public sealed partial class MissingValueSubstitutor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13689,9 +13657,7 @@ public sealed partial class NgramTransformColumn : OneToOneColumn<NgramTransform
 
         }
 
-        /// <summary>
-        /// Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="NgramTranslator"]/*' />
         public sealed partial class NGramTranslator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13879,9 +13845,7 @@ public NoOperationPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// If the source column does not exist after deserialization, create a column with the right type and default values.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="OptionalColumnTransform"]/*' />
         public sealed partial class OptionalColumnCreator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13983,7 +13947,7 @@ public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColu
 
         }
 
-        /// <include file='../Microsoft.ML.PCA/doc.xml' path='docs/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name="PCA"]/*' />
         public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14664,9 +14628,7 @@ public enum UngroupTransformUngroupMode
         }
 
 
-        /// <summary>
-        /// Un-groups vector columns into sequences of rows, inverse of Group transform
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="Ungroup"]/*' />
         public sealed partial class Segregator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14734,9 +14696,7 @@ public SegregatorPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Uses a pretrained sentiment model to score input strings
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="SentimentAnalyzer"]/*' />
         public sealed partial class SentimentAnalyzer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14993,9 +14953,7 @@ public sealed partial class TermLoaderArguments
 
         }
 
-        /// <summary>
-        /// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="TextTransform"]/*' />
         public sealed partial class TextFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -15130,9 +15088,7 @@ public TextFeaturizerPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <summary>
-        /// Converts input values (words, numbers, etc.) to index in a dictionary.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
         public sealed partial class TextToKeyConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -15297,9 +15253,7 @@ public sealed class Output
     namespace Transforms
     {
 
-        /// <summary>
-        /// Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.
-        /// </summary>
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="TreeEnsembleFeaturizerTransform"]'/>
         public sealed partial class TreeLeafFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IFeaturizerInput, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -15423,9 +15377,7 @@ public sealed partial class DelimitedTokenizeTransformColumn : OneToOneColumn<De
 
         }
 
-        /// <summary>
-        /// The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.
-        /// </summary>
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordTokenizer"]/*' />
         public sealed partial class WordTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
diff --git a/src/Microsoft.ML/Models/OnnxConverter.cs b/src/Microsoft.ML/Models/OnnxConverter.cs
index dd8ca383fd..8d4c48eff3 100644
--- a/src/Microsoft.ML/Models/OnnxConverter.cs
+++ b/src/Microsoft.ML/Models/OnnxConverter.cs
@@ -10,7 +10,7 @@ namespace Microsoft.ML.Models
     public sealed partial class OnnxConverter
     {
         /// <summary>
-        /// <see href="https://onnx.ai/">ONNX</see> is an intermediate representation format 
+        /// <a href="https://onnx.ai/">ONNX</a> is an intermediate representation format 
         /// for machine learning models. It is used to make models portable such that you can 
         /// train a model using a toolkit and run it in another tookit's runtime, for example,
         /// you can create a model using ML.NET (or any ONNX compatible toolkit), convert it to ONNX and 
diff --git a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
index 14a56da596..494fe6b225 100644
--- a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
+++ b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs
@@ -136,7 +136,7 @@ private static int GetNumberOfClasses(IHostEnvironment env, Arguments input, out
 
         [TlcModule.EntryPoint(Desc = "One-vs-All macro (OVA)",
             Name = "Models.OneVersusAll",
-            XmlInclude = new[] { @"<include file='../../../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""OVA""]'/>" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""OVA""]'/>" })]
         public static CommonOutputs.MacroOutput<Output> OVA(
             IHostEnvironment env,
             Arguments input,

From 311eb2a29bc5a706c5d72f011642fa9e4c57109a Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Mon, 16 Jul 2018 09:20:08 -0700
Subject: [PATCH 09/14] formatting tweaks, and addressing most of the code
 comments.

---
 src/Microsoft.ML.FastTree/doc.xml             | 66 +++++++++----------
 src/Microsoft.ML.KMeansClustering/doc.xml     |  6 +-
 src/Microsoft.ML.PCA/doc.xml                  | 17 +++--
 .../FactorizationMachine/doc.xml              |  4 +-
 .../Microsoft.ML.StandardLearners.csproj      |  1 -
 .../Standard/LogisticRegression/doc.xml       | 39 ++++++-----
 .../Standard/MultiClass/Pkpd.cs               |  2 +-
 .../Standard/MultiClass/doc.xml               | 16 ++---
 .../Standard/Online/doc.xml                   |  2 +-
 .../Standard/doc.xml                          | 13 +++-
 .../NAReplaceTransform.cs                     |  2 -
 src/Microsoft.ML.Transforms/Text/doc.xml      | 44 ++++---------
 src/Microsoft.ML.Transforms/doc.xml           | 33 ++++------
 13 files changed, 114 insertions(+), 131 deletions(-)

diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml
index 0990c8c3dc..9506308c47 100644
--- a/src/Microsoft.ML.FastTree/doc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -24,13 +24,16 @@
           The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
         </para>
         <list type='bullet'>
-          <item>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</item>
-          <item>In case of a regression problem, the output is the predicted value of the function.</item>
-          <item>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</item>
+          <item><description>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</description></item>
+          <item><description>In case of a regression problem, the output is the predicted value of the function.</description></item>
+          <item><description>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</description></item>
         </list>
-        <a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a>.
-        <a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a>.
-      </remarks>
+        <para>For more information see:</para>
+        <list>
+          <item><description><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting).</a></description></item>
+          <item><description><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine.</a></description></item>
+        </list>
+      </remarks>
     </member>
     
     <member name="FastForest">
@@ -48,15 +51,18 @@
           <item><description>They perform integrated feature selection and classification. </description></item>
           <item><description>They are resilient in the presence of noisy features.</description></item>
         </list>
-        Fast forest is a random forest implementation.
+        <para>Fast forest is a random forest implementation.
         The model consists of an ensemble of decision trees. Each tree in a decision forest outputs a Gaussian distribution by way of prediction.
         An aggregation is performed over the ensemble of trees to find a Gaussian distribution closest to the combined distribution for all trees in the model.
-        This decision forest classifier consists of an ensemble of decision trees.
-        Generally, ensemble models provide better coverage and accuracy than single decision trees.
-        Each tree in a decision forest outputs a Gaussian distribution.
-        <a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a>
-        <a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a>
-        <a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a>
+        This decision forest classifier consists of an ensemble of decision trees.</para>
+        <para>Generally, ensemble models provide better coverage and accuracy than single decision trees.
+        Each tree in a decision forest outputs a Gaussian distribution.</para>
+        <para>For more information, see:</para>
+        <list>
+          <item><description><a href='http://en.wikipedia.org/wiki/Random_forest'>Wikipedia: Random forest</a></description></item>
+          <item><description><a href='http://jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf'>Quantile regression forest</a></description></item>
+          <item><description><a href='https://blogs.technet.microsoft.com/machinelearning/2014/09/10/from-stumps-to-trees-to-forests/'>From Stumps to Trees to Forests</a></description></item>
+        </list>
       </remarks>
     </member>
 
@@ -68,7 +74,7 @@
       <remarks>
         The Tweedie boosting model follows the mathematics established in <a href="https://arxiv.org/pdf/1508.06378.pdf">
         Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models.</a> from Yang, Quan, and Zou. 
-        For an introduction to Gradient Boosting, and more information, see:
+        <para>For an introduction to Gradient Boosting, and more information, see:</para>
         <para><a href='https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting'>Wikipedia: Gradient boosting (Gradient tree boosting)</a></para>
         <para><a href='http://projecteuclid.org/DPubS?service=UI&amp;version=1.0&amp;verb=Display&amp;handle=euclid.aos/1013203451'>Greedy function approximation: A gradient boosting machine</a></para>
       </remarks>
@@ -79,15 +85,9 @@
         Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector
         to three outputs:
         <list>
-          <item>
-            <description>A vector containing the individual tree outputs of the tree ensemble.</description>
-          </item>
-          <item>
-            <description>A vector indicating the leaves that the feature vector falls on in the tree ensemble.</description>
-          </item>
-          <item>
-            <description>A vector indicating the paths that the feature vector falls on in the tree ensemble.</description>
-          </item>
+          <item><description>A vector containing the individual tree outputs of the tree ensemble.</description></item>
+          <item><description>A vector indicating the leaves that the feature vector falls on in the tree ensemble.</description></item>
+          <item><description>A vector indicating the paths that the feature vector falls on in the tree ensemble.</description></item>
         </list>
         If a both a model file and a trainer are specified - will use the model file. If neither are specified, 
         will train a default FastTree model. 
@@ -95,23 +95,17 @@
       </summary>
       <remarks>
         In machine learning​ it is a pretty common and powerful approach to utilize the already trained model in the process of defining features.
-        <para>A most obvious example could be to use the model's scores as features to downstream models. For example, we might run clustering on the original features, 
+        <para>One such example would be the use of the model's scores as features to downstream models. For example, we might run clustering on the original features, 
         and use the cluster distances as the new feature set.
         Instead of consuming the model's output, we could go deeper, and extract the 'intermediate outputs' that are used to produce the final score. </para>
-        There's a number of famous or popular examples of this technique:
+        There are a number of famous or popular examples of this technique:
         <list>
-          <item>
-            <description>A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'.
+          <item><description>A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'.
             It is observed that the Euclidian distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together,
-            and far away from pictures of kittens. </description>
-          </item>
-          <item>
-            <description>A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items.</description>
-          </item>
-          <item>
-            <description>The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model,
-            and there's no reason to compute them. </description>
-          </item>
+            and far away from pictures of kittens. </description></item>
+          <item><description>A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items.</description></item>
+          <item><description>The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model,
+            and there's no reason to compute them. </description></item>
         </list>
         <para>Tree featurizer uses the decision tree ensembles for feature engineering in the same fashion as above.</para>
         <para>Let's assume that we've built a tree ensemble of 100 trees with 100 leaves each (it doesn't matter whether boosting was used or not in training). 
diff --git a/src/Microsoft.ML.KMeansClustering/doc.xml b/src/Microsoft.ML.KMeansClustering/doc.xml
index 9c26056811..e5162a7bf5 100644
--- a/src/Microsoft.ML.KMeansClustering/doc.xml
+++ b/src/Microsoft.ML.KMeansClustering/doc.xml
@@ -13,8 +13,10 @@
         YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration.
         It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations.
         <para>For more information on K-means, and K-means++ see:</para>
-        <para><a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a>.</para>
-        <para><a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a></para>
+        <list>
+          <item><description><a href='https://en.wikipedia.org/wiki/K-means_clustering'>K-means</a></description></item>
+          <item><description><a href='https://en.wikipedia.org/wiki/K-means%2b%2b'>K-means++</a></description></item>
+        </list>
       </remarks>
     </member>
    
diff --git a/src/Microsoft.ML.PCA/doc.xml b/src/Microsoft.ML.PCA/doc.xml
index 42d3218c0f..126543fdc7 100644
--- a/src/Microsoft.ML.PCA/doc.xml
+++ b/src/Microsoft.ML.PCA/doc.xml
@@ -7,12 +7,21 @@
         PCA is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace. 
       </summary>
       <remarks>
-      <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principle Component Analysis (PCA)</a> is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace.
+      <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principal Component Analysis (PCA)</a> is a dimensionality-reduction algorithm which computes the projection of the feature vector onto a low-rank subspace.
       Its training is done using the technique described in the paper: <a href='https://arxiv.org/pdf/1310.6304v2.pdf'>Combining Structured and Unstructured Randomness in Large Scale PCA</a>,
       and the paper <a href='https://arxiv.org/pdf/0909.4061v2.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
-      <a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
-      <a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
-      <a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
+        <para>For more information, see also:</para>
+        <list>
+          <item><description>
+            <a href='http://web.stanford.edu/group/mmds/slides2010/Martinsson.pdf'>Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices</a>
+          </description></item>
+          <item><description>
+            <a href='https://arxiv.org/abs/0809.2274'>A randomized algorithm for principal component analysis</a>
+          </description></item>
+          <item><description>
+            <a href='http://users.cms.caltech.edu/~jtropp/papers/HMT11-Finding-Structure-SIREV.pdf'>Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions</a>
+          </description></item>
+        </list>
       </remarks>
       <example>
         An example of how to add the PcaCalculator transform to a pipeline with a column named &quot;Features&quot;.
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
index 5d90ea802c..aff3e80933 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
@@ -9,8 +9,8 @@
       <remarks>
         Field Aware Factorization Machines use, in addition to the input variables, factorized parameters to model the interaction between pairs of variables.
         The algorithm is particularly useful for high dimensional datasets which can be very sparse (e.g. click-prediction for advertising systems).
-        An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
-        <para> For a general idea of what Field-aware Factorization Machines are see: <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
+        <para>An advantage of FFM over SVMs is that the training data does not need to be stored in memory, and the coefficients can be optimized directly.
+          For a general idea of what Field-aware Factorization Machines are, see: <a href='https://www.csie.ntu.edu.tw/~r01922136/slides/ffm.pdf'>Field Aware Factorization Machines</a>
         </para>
         <para>See references below for more details. 
         This trainer is essentially faster the one introduced in [2] because of some implemtation tricks[3].
diff --git a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
index 9702f66080..6bada43299 100644
--- a/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
+++ b/src/Microsoft.ML.StandardLearners/Microsoft.ML.StandardLearners.csproj
@@ -10,7 +10,6 @@
     <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
     <ProjectReference Include="..\Microsoft.ML.CpuMath\Microsoft.ML.CpuMath.csproj" />
     <ProjectReference Include="..\Microsoft.ML.Data\Microsoft.ML.Data.csproj" />
-    <ProjectReference Include="..\Microsoft.ML.FastTree\Microsoft.ML.FastTree.csproj" />
     <ProjectReference Include="..\Microsoft.ML\Microsoft.ML.csproj" />
   </ItemGroup>
 
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
index 9d1e38f237..f216edcddb 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
@@ -4,37 +4,36 @@
 
     <member name="LBFGS">
       <summary>
-        Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. 
-        The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.
+        Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as 
+        a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistic function.
       </summary>
       <remarks>
-        If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.
+        If the dependent variable has more than two possible values (blood type given diagnostic test results), 
+        then the logistic regression is multinomial.
         <para>
-          The optimization technique used for LogisticRegression Classifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
-          Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps.
+          The optimization technique used for LogisticRegression Classifier is based on the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
+          The L1 regularization is an implementation of OWLQN, based on
+          <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.68.5260">Scalable training of L1-regularized log-linear models</a>.
+          Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive 
+          Hessian matrix in the equation used by Newton's method to calculate steps.
           But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction,
           so that it is especially suited for problems with a large number of variables.
-          The <paramref>MemorySize</paramref> parameter specifies the number of past positions and gradients to store for use in the computation of the next step.
+          The MemorySize argument specifies the number of past positions and gradients to store for use in the 
+          computation of the next step.
         </para>
         <para>
-          This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations.
-          Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
+          This learner can use elastic net regularization: a linear combination of L1 (LASSO) and L2 (ridge) regularizations.
+          Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information 
+          to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values.
           This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff.
           Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis.
-          An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
+          An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative 
+          values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
         </para>
           <list type='bullet'>
-            <item>
-              <description>
-              <paramref>L1Weight</paramref>: can be applied to sparse models, when working with high-dimensional data.
-              It pulls small weights associated features that are relatively unimportant towards 0.
-            </description>
-            </item>
-            <item>
-              <description>
-                <paramref>L2Weight</paramref>: is preferable for data that is not sparse. It pulls large weights towards zero.
-              </description>
-            </item>
+            <item><description>L1Weight can be applied to sparse models, when working with high-dimensional data.
+              It pulls small weights associated with features that are relatively unimportant towards 0.</description></item>
+            <item><description>L2Weight is preferable for data that is not sparse. It pulls large weights towards zero.</description></item>
           </list>
           Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
           The default values of x and y are both 1.
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
index ffda5f9819..1e26987cdb 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
@@ -44,7 +44,7 @@ namespace Microsoft.ML.Runtime.Learners
     /// pair.
     ///
     /// These two can allow you to exploit trainers that do not naturally have a
-    /// multiclass option, e.g., using <see cref="Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer"/> 
+    /// multiclass option, e.g., using the Runtime.FastTree.FastTreeBinaryClassificationTrainer 
     /// to solve a multiclass problem.
     /// Alternately, it can allow ML.NET to solve a "simpler" problem even in the cases
     /// where the trainer has a multiclass option, but using it directly is not
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
index 68faa53632..09b0fbc4ce 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
@@ -18,10 +18,10 @@
         classifier indicates that it does not need caching, OVA will always
         request caching, as it will be performing multiple passes over the data set.
       </remarks>
-      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'></seealso>
-      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'></seealso>
-      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'></seealso>
-      <seealso cref='Microsoft.ML.Models.OneVersusAll'></seealso>
+      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'/>
+      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'/>
+      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'/>
+      <seealso cref='Microsoft.ML.Models.OneVersusAll'/>
       <example>
         <code>
           pipeline.Add(new NaiveBayesClassifier(){ NormalizeFeatures = NormalizeOption.Auto, Caching = CachingOptions.Memory });
@@ -44,10 +44,10 @@
           can be different from LightGbmClassifier, which develops a multi-class classifier directly. 
         </para>
       </remarks>
-      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'>LogisticRegressionClassifier</seealso>
-      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'>LightGbmClassifier</seealso>
-      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'>StochasticDualCoordinateAscentClassifier</seealso>
-      <seealso cref='Microsoft.ML.Trainers.NaiveBayesClassifier'>NaiveBayesClassifier</seealso>
+      <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'/>
+      <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'/>
+      <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'/>
+      <seealso cref='Microsoft.ML.Trainers.NaiveBayesClassifier'/>
       <example>
         <code>
           pipeline.Add(OneVersusAll.With(new StochasticDualCoordinateAscentBinaryClassifier()));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
index c6cd70789f..2c1e90a9cf 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
@@ -5,7 +5,7 @@
     <member name="OGD">
       <summary>
         Stochastic gradient descent is an optimization method used to train a wide range of models in machine learning. 
-        In the ML.Net the implementation of OGD, it is for linear regression. 
+        In ML.NET, the implementation of OGD is for linear regression.
       </summary>
       <remarks>
         Stochastic gradient descent uses a simple yet efficient iterative technique to fit model coefficients using error gradients for convex loss functions.
diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
index e435fc154d..fcb289af81 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
@@ -21,9 +21,16 @@
           Elastic net regularization can be specified by the 'L2Const' and 'L1Threshold' parameters. Note that the 'L2Const' has an effect on the rate of convergence.
           In general, the larger the 'L2Const', the faster SDCA converges.
         </para>
-        <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
-        <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
-      </remarks>
+        <para>For more information, see also:</para>
+        <list>
+          <item><description>
+            <a href='https://www.microsoft.com/en-us/research/wp-content/uploads/2016/06/main-3.pdf'>Scaling Up Stochastic Dual Coordinate Ascent</a>.
+          </description></item>
+          <item><description>
+            <a href='http://www.jmlr.org/papers/volume14/shalev-shwartz13a/shalev-shwartz13a.pdf'>Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization</a>.
+          </description></item>
+        </list>
+       </remarks>
     </member>
  
   </members>
diff --git a/src/Microsoft.ML.Transforms/NAReplaceTransform.cs b/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
index 367e9fe491..ffd1ac5c74 100644
--- a/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
+++ b/src/Microsoft.ML.Transforms/NAReplaceTransform.cs
@@ -27,12 +27,10 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    // <summary>
     // This transform can transform either scalars or vectors (both fixed and variable size),
     // creating output columns that are identical to the input columns except for replacing NA values
     // with either the default value, user input, or imputed values (min/max/mean are currently supported).
     // Imputation modes are supported for vectors both by slot and across all slots.
-    // </summary>
     // REVIEW: May make sense to implement the transform template interface.
     /// <include file='doc.xml' path='doc/members/member[@name="NAReplace"]/*' />
     public sealed partial class NAReplaceTransform : OneToOneTransformBase
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index 5e4ffead3f..569f083da3 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -10,27 +10,13 @@
       <remarks>
         The TextFeaturizer transform gives user one-stop solution for doing:
         <list type="bullet">
-          <item>
-            <description>Language Detection</description>
-          </item>
-          <item>
-            <description>Tokenzation​</description>
-          </item>
-          <item>
-            <description>Text normalization</description>
-          </item>
-          <item>
-            <description>Predefined and custom stopwords removal.</description>
-          </item>
-          <item>
-            <description>Word-based or character-based Ngram and SkipGram extraction.​</description>
-          </item>
-          <item>
-            <description>TF, IDF or TF-IDF.</description>
-          </item>
-          <item>
-            <description>L-p vector normalization.​</description>
-          </item>
+          <item><description>Language Detection</description></item>
+          <item><description>Tokenization</description></item>
+          <item><description>Text normalization</description></item>
+          <item><description>Predefined and custom stopwords removal.</description></item>
+          <item><description>Word-based or character-based Ngram and SkipGram extraction.</description></item>
+          <item><description>TF, IDF or TF-IDF.</description></item>
+          <item><description>L-p vector normalization.</description></item>
         </list>
         The TextFeaturizer will show the transformed text, after being applied.
         It converts a collection of text columns to a matrix of token  ngrams/skip-grams counts.
@@ -83,18 +69,12 @@
         Embedding ngrams in a vector space allows their contents to be compared in an efficient manner. 
         The slot values in the vector can be weighted by the following factors:
         <list>
-          <item>
-            <description>term frequency - The number of occurrences of the slot in the text</description>
-          </item>
-          <item>
-            <description>
+          <item><description>term frequency - The number of occurrences of the slot in the text</description></item>
+          <item><description>
               inverse document frequency - A ratio (the logarithm of inverse relative slot frequency)
               that measures the information a slot provides by determining how common or rare it is across the entire text.
-            </description>
-          </item>
-            <item>
-              <description>term frequency-inverse document frequency - the product term frequency and the inverse document frequency.</description>
-            </item>
+            </description></item>
+            <item><description>term frequency-inverse document frequency - the product of term frequency and the inverse document frequency.</description></item>
         </list>
         This transform is not typically used on its own, but it is one of the transforms composing the <see cref="Microsoft.ML.Transforms.TextFeaturizer">Text Featurizer</see> .
       </remarks>
@@ -132,7 +112,7 @@
 
     <member name="CharacterTokenizer">
       <summary>
-        Character-oriented tokenizer where text is considered a sequence of characters. 
+        This transform breaks text into individual tokens, each consisting of an individual character.
       </summary>
       <remarks>
       This transform is not typically used on its own, but it is one of the transforms composing the 
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index 20e463ecd0..a3230b82c2 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -249,23 +249,18 @@
         <para>Variable-length pivot columns are supported (including zero, which will eliminate the row from the result).</para>
         <para>Multiple pivot columns are also supported:</para>
         <list type="bullet">
-          <item>
-            <description>A number of output rows is controlled by the 'mode' parameter. 
+          <item><description>The number of output rows is controlled by the 'mode' parameter.
             <list type="bullet">
-              <item>
-                <description>outer: it is equal to the maximum length of pivot columns</description>
-                <description>inner: it is equal to the minimum length of pivot columns</description>
-                <description>first: it is equal to the length of the first pivot column</description>
-              </item>
+              <item><term>outer</term><description>it is equal to the maximum length of pivot columns</description></item>
+              <item><term>inner</term><description>it is equal to the minimum length of pivot columns</description></item>
+              <item><term>first</term><description>it is equal to the length of the first pivot column</description></item>
             </list>
             </description>
           </item>
-          <item>
-            <description>
+          <item><description>
               If a particular pivot column has size that is different than the number of output rows, the extra slots will
               be ignored, and the missing slots will be 'padded' with default values.
-            </description>
-          </item>
+            </description></item>
         </list>
         <para>All metadata is preserved for the retained columns. For 'unrolled' columns, all known metadata
         except slot names is preserved.
@@ -300,19 +295,19 @@
         </code>
       </example>
     </member>
-    
+
     <member name="KeyToText">
       <summary>
-       The KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the 
-       KeyValues metadata.
+        The KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the
+        KeyValues metadata.
       </summary>
       <remarks>
-        The KeyToTextConverter is the complement of the <see  cref="TextToKeyConverter"/> transform. 
-        Since key values are an enumeration into the set of keys, most transforms that produce key valued outputs 
+        The KeyToTextConverter is the complement of the <see  cref="TextToKeyConverter"/> transform.
+        Since key values are an enumeration into the set of keys, most transforms that produce key valued outputs
         corresponding to input values will often, wherever possible, associate a piece of KeyValue metadata with that dataset.
-        Transforming values into a categorical variable would be of limited use, 
-        if we couldn't somehow backtrack to figure out what those categories actually mean. 
-        The KeyToTextConverter enables that functionality. 
+        Transforming values into a categorical variable would be of limited use,
+        if we couldn't somehow backtrack to figure out what those categories actually mean.
+        The KeyToTextConverter enables that functionality.
       </remarks>
       <seealso cref="Microsoft.ML.Transforms.HashConverter"/>
       <seealso cref="Microsoft.ML.Transforms.TextToKeyConverter"/>

From 8e91165f0c5bbf85216d48e64e2927cf87c077e6 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Tue, 17 Jul 2018 10:17:51 -0700
Subject: [PATCH 10/14] Extracted the examples outside of the member nodes in
 the xml, so that they only appear in the CSharpApi classes, and not on the
 runtime classes.

---
 src/Microsoft.ML.Data/Transforms/doc.xml      |  47 +++---
 .../FastTreeClassification.cs                 |   5 +-
 src/Microsoft.ML.FastTree/FastTreeRanking.cs  |   3 +-
 .../FastTreeRegression.cs                     |   3 +-
 .../RandomForestClassification.cs             |   3 +-
 .../RandomForestRegression.cs                 |   3 +-
 src/Microsoft.ML.FastTree/doc.xml             |  64 +++++++-
 .../KMeansPlusPlusTrainer.cs                  |   3 +-
 src/Microsoft.ML.KMeansClustering/doc.xml     |  12 ++
 .../LightGbmBinaryTrainer.cs                  |   3 +-
 .../LightGbmMulticlassTrainer.cs              |   3 +-
 .../LightGbmRankingTrainer.cs                 |   3 +-
 .../LightGbmRegressionTrainer.cs              |   3 +-
 src/Microsoft.ML.LightGBM/doc.xml             |  63 ++++++++
 src/Microsoft.ML.PCA/PcaTrainer.cs            |   3 +-
 src/Microsoft.ML.PCA/PcaTransform.cs          |   3 +-
 src/Microsoft.ML.PCA/doc.xml                  |  18 ++-
 .../FactorizationMachineTrainer.cs            |   3 +-
 .../FactorizationMachine/doc.xml              |  17 +-
 .../Standard/LinearClassificationTrainer.cs   |   3 +-
 .../MulticlassLogisticRegression.cs           |   4 +-
 .../Standard/LogisticRegression/doc.xml       |  14 +-
 .../MultiClass/MultiClassNaiveBayesTrainer.cs |   3 +-
 .../Standard/MultiClass/doc.xml               |   8 +-
 .../Standard/Online/AveragedPerceptron.cs     |   3 +-
 .../Standard/Online/OnlineGradientDescent.cs  |   3 +-
 .../Standard/Online/doc.xml                   |  24 +++
 .../PoissonRegression/PoissonRegression.cs    |   3 +-
 .../Standard/PoissonRegression/doc.xml        |  11 ++
 .../Standard/SdcaMultiClass.cs                |   3 +-
 .../Standard/SdcaRegression.cs                |   3 +-
 .../Standard/doc.xml                          |  37 +++++
 .../CategoricalTransform.cs                   |   9 +-
 .../EntryPoints/SelectFeatures.cs             |   6 +-
 .../EntryPoints/TextAnalytics.cs              |  12 +-
 .../HashJoinTransform.cs                      |   3 +-
 src/Microsoft.ML.Transforms/NAHandling.cs     |  15 +-
 .../OptionalColumnTransform.cs                |   3 +-
 src/Microsoft.ML.Transforms/Text/doc.xml      |  65 +++++---
 .../UngroupTransform.cs                       |   3 +-
 src/Microsoft.ML.Transforms/doc.xml           | 148 +++++++++++-------
 41 files changed, 477 insertions(+), 168 deletions(-)

diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml
index a7239debe1..808595147e 100644
--- a/src/Microsoft.ML.Data/Transforms/doc.xml
+++ b/src/Microsoft.ML.Data/Transforms/doc.xml
@@ -13,44 +13,39 @@
         it will keep only the rows that have missing values.
       </remarks>
       <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+    </member>
+    <example name="NAFilter">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new MissingValuesRowDropper(&quot;Column1&quot;));
         </code>
       </example>
-    </member>
+    </example>
 
-    <member name="NAHandle">
+    <member name="TextToKey">
       <summary>
-        Handle missing values by replacing them with either the default value or the indicated value. 
+        Converts input values (words, numbers, etc.) to index in a dictionary.
       </summary>
       <remarks>
-        This transform handles missing values in the input columns. For each input column, it creates an output column
-         where the missing values are replaced by one of these specified values:
-         <list type="bullet">
-           <item><description>The default value of the appropriate type.</description></item>
-           <item><description>The mean value of the appropriate type.</description></item>
-           <item><description>The max value of the appropriate type.</description></item>
-           <item><description>The min value of the appropriate type.</description></item>
-         </list>
-         <para>The last three work only for numeric/TimeSpan/DateTime kind columns.</para>       
-         <para> The output column can also optionally include an indicator vector for which slots were missing in the input column.
-         This can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns.
-         </para>
-         <para>
-           When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot.
-           This option has a default value of true for variable length vectors, and false for known length vectors. 
-           It can be changed to true for known length vectors, but it results in an error if changed to false for variable length vectors.
-         </para>
+        The TextToKeyConverter transform builds up term vocabularies (dictionaries).
+        The TextToKeyConverter and the <see cref="T:Microsoft.ML.Transforms.HashConverter"/> are the two primary mechanisms by which raw input is transformed into keys.
+        If multiple columns are used, each column builds/uses exactly one vocabulary (dictionary).
+        The output columns are KeyType-valued.
+        The Key value is the one-based index of the item in the dictionary.
+        If the key is not found in the dictionary, it is assigned the missing value indicator.
+        This dictionary mapping values to keys is most commonly learnt from the unique values in input data,
+        but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file.
       </remarks>
-      <seealso cref=" Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"/>
-      <seealso cref="Microsoft.ML.Data.DataKind"/>
+      <seealso cref="T:Microsoft.ML.Transforms.HashConverter"/>
+      <seealso cref="T:Microsoft.ML.Transforms.KeyToTextConverter"/>
+    </member>
+    <example name="TextToKey">
       <example>
-        <code>
-          pipeline.Add(new MissingValueHandler(&quot;FeatureCol&quot;, &quot;CleanFeatureCol&quot;) { ReplaceWith  = NAHandleTransformReplacementKind.Mean });
+        <code language="csharp">
+          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;)){ Sort = TermTransformSortOrder.Occurrence });
         </code>
       </example>
-    </member>
+    </example>
     
   </members>
 </doc>
diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
index 1eb2a7b6f9..041aff96d0 100644
--- a/src/Microsoft.ML.FastTree/FastTreeClassification.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -100,7 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }
     }
 
-    /// <include file = './doc.xml' path='doc/members/member[@name="FastTree"]/*' />
+    /// <include file = 'doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer :
         BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -346,7 +346,8 @@ public static partial class FastTree
             Desc = FastTreeBinaryClassificationTrainer.Summary,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
             ShortName = FastTreeBinaryClassificationTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
index 6a17dc2d13..ebd5644553 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRanking.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -1101,7 +1101,8 @@ public static partial class FastTree
             Desc = FastTreeRankingTrainer.Summary,
             UserName = FastTreeRankingTrainer.UserNameValue,
             ShortName = FastTreeRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRanker""]/*' />"})]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
index c625594946..f18940f212 100644
--- a/src/Microsoft.ML.FastTree/FastTreeRegression.cs
+++ b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -453,7 +453,8 @@ public static partial class FastTree
             Desc = FastTreeRegressionTrainer.Summary,
             UserName = FastTreeRegressionTrainer.UserNameValue,
             ShortName = FastTreeRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRegressor""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
index 86daffd712..a087674a09 100644
--- a/src/Microsoft.ML.FastTree/RandomForestClassification.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -213,7 +213,8 @@ public static partial class FastForest
             Desc = FastForestClassification.Summary,
             UserName = FastForestClassification.UserNameValue,
             ShortName = FastForestClassification.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestBinaryClassifier""]/*' />"})]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
index 68817b420d..627f65e7c5 100644
--- a/src/Microsoft.ML.FastTree/RandomForestRegression.cs
+++ b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -285,7 +285,8 @@ public static partial class FastForest
             Desc = FastForestRegression.Summary,
             UserName = FastForestRegression.LoadNameValue,
             ShortName = FastForestRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestRegressor""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml
index 9506308c47..170ae8bc9d 100644
--- a/src/Microsoft.ML.FastTree/doc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -35,6 +35,43 @@
         </list>  
     </remarks>
     </member>
+    <example name='FastTreeRanker'>
+      <example>
+        <code language="csharp">
+          new FastTreeRanker
+          {
+            SortingAlgorithm = "DescendingReverse",
+            OptimizationAlgorithm = BoostedTreeArgsOptimizationAlgorithmType.AcceleratedGradientDescent
+          }
+        </code>
+      </example>
+    </example>
+    <example name='FastTreeRegressor'>
+      <example>
+        <code language="csharp">
+          new FastTreeRegressor
+          {
+            NumTrees = 200,
+            EarlyStoppingRule = new GLEarlyStoppingCriterion(),
+            LearningRates = 0.4f,
+            DropoutRate = 0.05f
+          }
+        </code>
+      </example>
+    </example>
+    <example name='FastTreeBinaryClassifier'>
+      <example>
+        <code language="csharp">
+          new FastTreeBinaryClassifier
+          {
+            NumTrees = 100,
+            EarlyStoppingRule = new PQEarlyStoppingCriterion(),
+            LearningRates = 0.4f,
+            DropoutRate = 0.05f
+          }
+        </code>
+      </example>
+    </example>
     
     <member name="FastForest">
       <summary>
@@ -65,6 +102,31 @@
         </list>
       </remarks>
     </member>
+    <example name='FastForestBinaryClassifier'>
+      <example>
+        <code language="csharp">
+          new FastForestBinaryClassifier
+          {
+            NumTrees = 100,
+            NumLeaves = 50,
+            Calibrator = new FixedPlattCalibratorCalibratorTrainer()
+          }
+        </code>
+      </example>
+    </example>
+    <example name='FastForestRegressor'>
+      <example>
+        <code language="csharp">
+          new FastForestRegressor
+          {
+            NumTrees = 100,
+            NumLeaves = 50,
+            NumThreads = 5,
+            EntropyCoefficient = 0.3
+          }
+        </code>
+      </example>
+    </example>
 
     <member name="FastTreeTweedieRegression">
       <summary>
@@ -120,7 +182,7 @@
         <para>The TreeLeafFeaturizer is also producing the third vector, T, which is defined as Ti(x) = output of tree #i on example x.</para>
       </remarks>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new TreeLeafFeaturizer())
         </code>
       </example>
diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
index 66dcfaaeca..e51db16349 100644
--- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
+++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs
@@ -213,7 +213,8 @@ private static int ComputeNumThreads(IHost host, int? argNumThreads)
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/member[@name=""KMeans++""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/member[@name=""KMeans++""]/*' />",
+                                 @"<include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/example[@name=""KMeans++""]/*' />"})]
         public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.KMeansClustering/doc.xml b/src/Microsoft.ML.KMeansClustering/doc.xml
index e5162a7bf5..a1590595dc 100644
--- a/src/Microsoft.ML.KMeansClustering/doc.xml
+++ b/src/Microsoft.ML.KMeansClustering/doc.xml
@@ -19,6 +19,18 @@
         </list>
       </remarks>
     </member>
+    <example name="KMeans++">
+      <example>
+        <code language="csharp">
+          new KMeansPlusPlusClusterer
+          {
+            MaxIterations = 100,
+            NumThreads = 5,
+            InitAlgorithm = KMeansPlusPlusTrainerInitAlgorithm.KMeansParallel
+          }
+        </code>
+      </example>
+    </example>
    
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 6806b00110..7b5a0da0f1 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -134,7 +134,8 @@ public static partial class LightGbm
             Desc = LightGbmBinaryTrainer.Summary,
             UserName = LightGbmBinaryTrainer.UserName, 
             ShortName = LightGbmBinaryTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />",
+                                 @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name=""LightGbmBinaryClassifier""]/*' />"})]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 858013388c..e359959db3 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -185,7 +185,8 @@ public static partial class LightGbm
             Desc = "Train a LightGBM multi class model.", 
             UserName = LightGbmMulticlassTrainer.Summary, 
             ShortName = LightGbmMulticlassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />",
+                                 @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name=""LightGbmClassifier""]/*' />"})]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index 659e4239d7..dfe0426ad2 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -132,7 +132,8 @@ public static partial class LightGbm
             Desc = "Train a LightGBM ranking model.", 
             UserName = LightGbmRankingTrainer.UserName, 
             ShortName = LightGbmRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />",
+                                 @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name=""LightGbmRanker""]/*' />"})]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 13c254a509..67d8b8cd4b 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -124,7 +124,8 @@ public static partial class LightGbm
             Desc = LightGbmRegressorTrainer.Summary, 
             UserName = LightGbmRegressorTrainer.UserNameValue, 
             ShortName = LightGbmRegressorTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name=""LightGBM""]/*' />",
+                                 @"<include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name=""LightGbmRegressor""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/doc.xml b/src/Microsoft.ML.LightGBM/doc.xml
index b32473485a..1fcd38dd7a 100644
--- a/src/Microsoft.ML.LightGBM/doc.xml
+++ b/src/Microsoft.ML.LightGBM/doc.xml
@@ -11,6 +11,69 @@
         <a href='https://github.com/Microsoft/LightGBM/wiki'>GitHub: LightGBM</a>
       </remarks>
     </member>
+    <example name='LightGbmBinaryClassifier'>
+      <example>
+        <code language="csharp">
+          new LightGbmBinaryClassifier
+          {
+            NumBoostRound = 200,
+            LearningRate = 0.5f,
+            NumLeaves = 32,
+            MinDataPerLeaf = 20
+          }
+        </code>
+      </example>
+    </example>
+    <example name='LightGbmClassifier'>
+      <example>
+        <code language="csharp">
+          new LightGbmClassifier
+          {
+            NumBoostRound = 200,
+            LearningRate = 0.5f,
+            NumLeaves = 32,
+            MinDataPerLeaf = 20
+          }
+        </code>
+      </example>
+    </example>
+    <example name='LightGbmRegressor'>
+      <example>
+        <code language="csharp">
+          new LightGbmRegressor
+          {
+            NumBoostRound = 100,
+            LearningRate = 0.5f,
+            NumLeaves = 32,
+            MinDataPerLeaf = 20,
+            Booster = new DartBoosterParameterFunction
+            {
+              XgboostDartMode = true,
+              UniformDrop = true
+            }
+          }
+        </code>
+      </example>
+    </example>
+    <example name='LightGbmRanker'>
+      <example>
+        <code language="csharp">
+          new LightGbmRanker
+          {
+            NumBoostRound = 100,
+            LearningRate = 0.5f,
+            NumLeaves = 32,
+            MinDataPerLeaf = 20,
+            Booster = new GbdtBoosterParameterFunction
+            {
+              MinSplitGain = 3,
+              MaxDepth = 200,
+              Subsample = 0.5
+            }
+          }
+        </code>
+      </example>
+    </example>
 
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 840744f6f3..6bca9e83f0 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -288,7 +288,8 @@ private static void PostProcess(VBuffer<Float>[] y, Float[] sigma, Float[] z, in
             Desc = "Train an PCA Anomaly model.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />",
+                                 @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/example[@name=""PcaAnomalyDetector""]/*' />" })]
         public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.PCA/PcaTransform.cs b/src/Microsoft.ML.PCA/PcaTransform.cs
index 3973d443b8..6efbead226 100644
--- a/src/Microsoft.ML.PCA/PcaTransform.cs
+++ b/src/Microsoft.ML.PCA/PcaTransform.cs
@@ -541,7 +541,8 @@ private static void TransformFeatures(IExceptionContext ectx, ref VBuffer<Float>
             Desc = Summary,
             UserName = UserName, 
             ShortName = ShortName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name=""PCA""]/*' />",
+                                 @"<include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/example[@name=""PcaCalculator""]/*' />"})]
         public static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input);
diff --git a/src/Microsoft.ML.PCA/doc.xml b/src/Microsoft.ML.PCA/doc.xml
index 126543fdc7..66bf06cc30 100644
--- a/src/Microsoft.ML.PCA/doc.xml
+++ b/src/Microsoft.ML.PCA/doc.xml
@@ -23,14 +23,28 @@
           </description></item>
         </list>
       </remarks>
+    </member>
+    <example name='PcaCalculator'>
       <example>
         An example of how to add the PcaCalculator transform to a pipeline with a column named &quot;Features&quot;.
-        <code>
+        <code language="csharp">
           string[] features = new string[&quot;Sepal length&quot;, &quot;Sepal width&quot;, &quot;Petal length&quot;, &quot;Petal width&quot;];
           pipeline.Add(new PcaCalculator(columns){ Rank = 3 });
         </code>
       </example>
-    </member>
+    </example>
+    <example name='PcaAnomalyDetector'>
+      <example>
+        <code language="csharp">
+          new PcaAnomalyDetector
+          {
+            Rank = 40,
+            Oversampling = 40,
+            NormalizeFeatures = Microsoft.ML.Models.NormalizeOption.Warn
+          }
+        </code>
+      </example>
+    </example>
     
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
index f14138d881..1711000ad9 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/FactorizationMachineTrainer.cs
@@ -407,7 +407,8 @@ public override FieldAwareFactorizationMachinePredictor CreatePredictor()
             Desc = Summary,
             UserName = UserName,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/member[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/example[@name=""FieldAwareFactorizationMachineBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
index aff3e80933..3a2e5f72ad 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
@@ -17,26 +17,23 @@
         </para>
           <list >
             <item>
-              <description>
-                [1] <a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a></description></item>
+              [1] <description><a href='http://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf'>Field-aware Factorization Machines for CTR Prediction</a></description></item>
             <item>
-              <description>
-                [2] <a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a>
-              </description>
+              [2] <description><a href='http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf'>Adaptive Subgradient Methods for Online Learning and Stochastic Optimization</a></description>
             </item>
             <item>
-              <description>
-                [3] <a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a>
-              </description>
+              [3] <description><a href='https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf'>An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.</a></description>
             </item>
           </list>
       </remarks>
+    </member>
+    <example name="FieldAwareFactorizationMachineBinaryClassifier">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier(){ LearningRate = 0.5f, Iter=2 });
         </code>
       </example>
-    </member>
+    </example>
         
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
index 90e9f60a20..13cd0611bf 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs
@@ -1779,7 +1779,8 @@ public static partial class Sdca
             Desc = "Train an SDCA binary model.",
             UserName = LinearClassificationTrainer.UserNameValue,
             ShortName = LinearClassificationTrainer.LoadNameValue,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />", 
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name=""StochasticDualCoordinateAscentBinaryClassifier""]/*'/>" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LinearClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index 4ddb873f6e..8067266577 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -36,8 +36,8 @@
 
 namespace Microsoft.ML.Runtime.Learners
 {
-    /// <include file = './doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
-    /// <include file = './doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+    /// <include file = 'doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
+    /// <include file = 'doc.xml' path='doc/members/example[@name="LogisticRegressionClassifier"]/*' />
     public sealed class MulticlassLogisticRegression : LbfgsTrainerBase<VBuffer<Float>, MulticlassLogisticRegressionPredictor>
     {
         public const string LoadNameValue = "MultiClassLogisticRegression";
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
index f216edcddb..03b844a0ea 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml
@@ -12,8 +12,6 @@
         then the logistic regression is multinomial.
         <para>
           The optimization technique used for LogisticRegression Classifier is based on the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS).
-          The L1 regularization is an implementation of OWLQN, based on
-          <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.68.5260">Scalable training of L1-regularized log-linear models</a>
           Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive 
           Hessian matrix in the equation used by Newton's method to calculate steps.
           But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction,
@@ -31,8 +29,12 @@
           values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.
         </para>
           <list type='bullet'>
-            <item><description>L1Weight can be applied to sparse models, when working with high-dimensional data.
-              It pulls small weights associated features that are relatively unimportant towards 0.</description></item>
+            <item><description>
+              L1Weight can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features
+              that are relatively unimportant towards 0.
+              L1 regularization is an implementation of OWLQN, based on:
+              <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.68.5260">Scalable training of L1-regularized log-linear models</a>
+            </description></item>
             <item><description>L2Weight is preferable for data that is not sparse. It pulls large weights towards zero.</description></item>
           </list>
           Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms.
@@ -49,14 +51,14 @@
     </member>
     <example name='LogisticRegressionClassifier'>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new LogisticRegressionClassifier());
         </code>
       </example>
     </example>
     <example name='LogisticRegressionBinaryClassifier'>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new LogisticRegressionBinaryClassifier());
         </code>
       </example>
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
index 9fbb29b316..fdfb76f218 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs
@@ -129,7 +129,8 @@ public override MultiClassNaiveBayesPredictor CreatePredictor()
             Desc = "Train a MultiClassNaiveBayesTrainer.",
             UserName = UserName, 
             ShortName = ShortName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""MultiClassNaiveBayesTrainer""]'/>" } )]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name=""MultiClassNaiveBayesTrainer""]'/>",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/example[@name=""MultiClassNaiveBayesTrainer""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClassNaiveBayesTrainer(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
index 09b0fbc4ce..e4e6580a19 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
@@ -22,12 +22,14 @@
       <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'/>
       <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'/>
       <seealso cref='Microsoft.ML.Models.OneVersusAll'/>
+    </member>
+    <example name="MultiClassNaiveBayesTrainer">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new NaiveBayesClassifier(){ NormalizeFeatures = NormalizeOption.Auto, Caching = CachingOptions.Memory });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="OVA">
       <summary>
@@ -49,7 +51,7 @@
       <seealso cref='Microsoft.ML.Trainers.StochasticDualCoordinateAscentClassifier'/>
       <seealso cref='Microsoft.ML.Trainers.NaiveBayesClassifier'/>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(OneVersusAll.With(new StochasticDualCoordinateAscentBinaryClassifier()));
         </code>
       </example>
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
index 371a5bde58..e77c3975a1 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/AveragedPerceptron.cs
@@ -94,7 +94,8 @@ public override LinearBinaryPredictor CreatePredictor()
             Desc = Summary,
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""AP""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""AP""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/example[@name=""AP""]/*' />"})]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
index 4e6bd89b8f..9b132ef8d1 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/OnlineGradientDescent.cs
@@ -94,7 +94,8 @@ public override TPredictor CreatePredictor()
             Desc = "Train a Online gradient descent perceptron.",
             UserName = UserNameValue,
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""OGD""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name=""OGD""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/example[@name=""OGD""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
index 2c1e90a9cf..08d1c161f9 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
@@ -13,6 +13,18 @@
         and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default).
       </remarks>
     </member>
+    <example name="OGD">
+      <example>
+        <code language="csharp">
+          new OnlineGradientDescentRegressor()
+          {
+            NumIterations = 10,
+            L2RegularizerWeight = 0.6f,
+            LossFunction = new PoissonLossRegressionLossFunction()
+          }
+        </code>
+      </example>
+    </example>
 
     <member name="AP">
       <summary>
@@ -39,6 +51,18 @@
         <para><a href='http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.48.8200'>Large Margin Classification Using the Perceptron Algorithm</a></para>
       </remarks>
     </member>
+    <example name="AP">
+      <example>
+        <code language="csharp">
+          new AveragedPerceptronBinaryClassifier()
+          {
+            NumIterations = 10,
+            L2RegularizerWeight = 0.01f,
+            LossFunction = new ExpLossClassificationLossFunction()
+          }
+        </code>
+      </example>
+    </example>
 
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
index 356c06b07d..1aecf72291 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/PoissonRegression.cs
@@ -129,7 +129,8 @@ protected override void ProcessPriorDistribution(Float label, Float weight)
             Desc = "Train an Poisson regression model.", 
             UserName = UserNameValue, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/member[@name=""PoissonRegression""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/member[@name=""PoissonRegression""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/example[@name=""PoissonRegression""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
index 9e7f4eee38..62daf1ecf3 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
@@ -12,6 +12,17 @@
         Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations.
       </remarks>
     </member>
+    <example name="PoissonRegression">
+      <example>
+        <code language="csharp">
+          new PoissonRegressor()
+          {
+            MaxIterations = 100,
+            L2Weight = 0.6f
+          }
+        </code>
+      </example>
+    </example>
    
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
index a256f7ae79..3885e3d8f9 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaMultiClass.cs
@@ -389,7 +389,8 @@ public static partial class Sdca
             Desc = SdcaMultiClassTrainer.Summary,
             UserName = SdcaMultiClassTrainer.UserNameValue,
             ShortName = SdcaMultiClassTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name=""StochasticDualCoordinateAscentClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, SdcaMultiClassTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
index 554603a7d9..f4eaf32c99 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaRegression.cs
@@ -136,7 +136,8 @@ public static partial class Sdca
             Desc = SdcaRegressionTrainer.Summary,
             UserName = SdcaRegressionTrainer.UserNameValue,
             ShortName = SdcaRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name=""SDCA""]/*' />",
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name=""StochasticDualCoordinateAscentRegressor""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, SdcaRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
index fcb289af81..acef787636 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
@@ -32,6 +32,43 @@
         </list>
        </remarks>
     </member>
+    <example name="StochasticDualCoordinateAscentBinaryClassifier">
+      <example>
+        <code language="csharp">
+          new StochasticDualCoordinateAscentBinaryClassifier()
+          {
+            MaxIterations = 100,
+            NumThreads = 7,
+            LossFunction = new SmoothedHingeLossSDCAClassificationLossFunction(),
+            Caching = Microsoft.ML.Models.CachingOptions.Disk
+          }
+        </code>
+      </example>
+    </example>
+    <example name="StochasticDualCoordinateAscentClassifier">
+      <example>
+        <code language="csharp">
+          new StochasticDualCoordinateAscentClassifier()
+          {
+            MaxIterations = 100,
+            NumThreads = 7,
+            LossFunction = new SmoothedHingeLossSDCAClassificationLossFunction()
+          }
+        </code>
+      </example>
+    </example>
+    <example name="StochasticDualCoordinateAscentRegressor">
+      <example>
+        <code language="csharp">
+          new StochasticDualCoordinateAscentRegressor
+          {
+            MaxIterations = 100,
+            NumThreads = 5,
+            LossFunction = new SquaredLossSDCARegressionLossFunction()
+          }
+        </code>
+      </example>
+    </example>
  
   </members>
 </doc>
\ No newline at end of file
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index 40dd657af7..2d4e9e1807 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -246,7 +246,8 @@ public static class Categorical
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", 
             Desc = CategoricalTransform.Summary,
             UserName = CategoricalTransform.UserName, 
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalOneHotVectorizer""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""CategoricalOneHotVectorizer""]/*' />"})]
         public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, CategoricalTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -261,7 +262,8 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", 
             Desc = CategoricalHashTransform.Summary,
             UserName = CategoricalHashTransform.UserName ,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CategoricalHashOneHotVectorizer""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""CategoricalHashOneHotVectorizer""]/*' />"})]
         public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, CategoricalHashTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -276,7 +278,8 @@ public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.TextToKeyConverter",
             Desc = TermTransform.Summary, 
             UserName = TermTransform.UserName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""TextToKey""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""TextToKey""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""TextToKey""]/*' />" })]
         public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, TermTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
index d4791f59d5..2d7246763f 100644
--- a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
+++ b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs
@@ -14,7 +14,8 @@ public static class SelectFeatures
         [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByCount", 
             Desc = CountFeatureSelectionTransform.Summary, 
             UserName = CountFeatureSelectionTransform.UserName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CountFeatureSelection""]'/>" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""CountFeatureSelection""]'/>",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""CountFeatureSelection""]'/>"})]
         public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, CountFeatureSelectionTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -30,7 +31,8 @@ public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, Co
             Desc = MutualInformationFeatureSelectionTransform.Summary, 
             UserName = MutualInformationFeatureSelectionTransform.UserName, 
             ShortName = MutualInformationFeatureSelectionTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""MutualInformationFeatureSelection""]'/>" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""MutualInformationFeatureSelection""]'/>",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""MutualInformationFeatureSelection""]'/>"})]
         public static CommonOutputs.TransformOutput MutualInformationSelect(IHostEnvironment env, MutualInformationFeatureSelectionTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
index 8eba9a2743..b37ecd5f4f 100644
--- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
+++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
@@ -21,7 +21,8 @@ public static class TextAnalytics
             Desc = Data.TextTransform.Summary, 
             UserName = Data.TextTransform.UserName, 
             ShortName = Data.TextTransform.LoaderSignature,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""TextTransform""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""TextTransform""]/*' />" ,
+                                 @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""TextTransform""]/*' />"})]
         public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "TextTransform", input);
@@ -37,7 +38,8 @@ public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env,
             Desc = Data.DelimitedTokenizeTransform.Summary,
             UserName = Data.DelimitedTokenizeTransform.UserName, 
             ShortName = Data.DelimitedTokenizeTransform.LoaderSignature,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordTokenizer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""WordTokenizer""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""WordTokenizer""]/*' />"})]
         public static CommonOutputs.TransformOutput DelimitedTokenizeTransform(IHostEnvironment env, DelimitedTokenizeTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "DelimitedTokenizeTransform", input);
@@ -84,7 +86,8 @@ public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env,
             Desc = "Uses a pretrained sentiment model to score input strings", 
             UserName = SentimentAnalyzingTransform.UserName, 
             ShortName = SentimentAnalyzingTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""SentimentAnalyzer""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""SentimentAnalyzer""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""SentimentAnalyzer""]/*' />"})]
         public static CommonOutputs.TransformOutput AnalyzeSentiment(IHostEnvironment env, SentimentAnalyzingTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "SentimentAnalyzer", input);
@@ -119,7 +122,8 @@ public static CommonOutputs.TransformOutput CharTokenize(IHostEnvironment env, C
             Desc = LdaTransform.Summary, 
             UserName = LdaTransform.UserName, 
             ShortName = LdaTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""LightLDA""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name=""LightLDA""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name=""LightLDA""]/*' />" })]
         public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, LdaTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/HashJoinTransform.cs b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
index f5fb3a6f71..afd5fb7aa3 100644
--- a/src/Microsoft.ML.Transforms/HashJoinTransform.cs
+++ b/src/Microsoft.ML.Transforms/HashJoinTransform.cs
@@ -679,7 +679,8 @@ public static class HashJoin
             Desc = HashJoinTransform.Summary, 
             UserName = HashJoinTransform.UserName, 
             ShortName = HashJoinTransform.RegistrationName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""HashJoin""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""HashJoin""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""HashJoin""]/*' />"})]
         public static CommonOutputs.TransformOutput Apply(IHostEnvironment env, HashJoinTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/NAHandling.cs b/src/Microsoft.ML.Transforms/NAHandling.cs
index 992b2998d0..7190291b16 100644
--- a/src/Microsoft.ML.Transforms/NAHandling.cs
+++ b/src/Microsoft.ML.Transforms/NAHandling.cs
@@ -15,7 +15,8 @@ public static class NAHandling
             Desc = NADropTransform.Summary,
             UserName = NADropTransform.FriendlyName, 
             ShortName = NADropTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NADrop""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NADrop""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""NADrop""]/*' />" })]
         public static CommonOutputs.TransformOutput Drop(IHostEnvironment env, NADropTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, NADropTransform.ShortName, input);
@@ -31,7 +32,8 @@ public static CommonOutputs.TransformOutput Drop(IHostEnvironment env, NADropTra
             Desc = NAFilter.Summary, 
             UserName = NAFilter.FriendlyName, 
             ShortName = NAFilter.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""NAFilter""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""NAFilter""]/*' />",  
+                                 @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name=""NAFilter""]/*' />"})]
         public static CommonOutputs.TransformOutput Filter(IHostEnvironment env, NAFilter.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, NAFilter.ShortName, input);
@@ -47,7 +49,8 @@ public static CommonOutputs.TransformOutput Filter(IHostEnvironment env, NAFilte
             Desc = NAHandleTransform.Summary, 
             UserName = NAHandleTransform.FriendlyName, 
             ShortName = NAHandleTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""NAFilter""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""NAHandle""]/*' />",
+                                 @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name=""NAHandle""]/*' />" })]
         public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandleTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAHandle", input);
@@ -63,7 +66,8 @@ public static CommonOutputs.TransformOutput Handle(IHostEnvironment env, NAHandl
             Desc = NAIndicatorTransform.Summary, 
             UserName = NAIndicatorTransform.FriendlyName, 
             ShortName = NAIndicatorTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAIndicator""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAIndicator""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""NAIndicator""]/*' />"})]
         public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIndicatorTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAIndicator", input);
@@ -79,7 +83,8 @@ public static CommonOutputs.TransformOutput Indicator(IHostEnvironment env, NAIn
             Desc = NAReplaceTransform.Summary, 
             UserName = NAReplaceTransform.FriendlyName, 
             ShortName = NAReplaceTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAReplace""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""NAReplace""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""NAReplace""]/*' />"})]
         public static CommonOutputs.TransformOutput Replace(IHostEnvironment env, NAReplaceTransform.Arguments input)
         {
             var h = EntryPointUtils.CheckArgsAndCreateHost(env, "NAReplace", input);
diff --git a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
index d03245c35e..d1423bf9bc 100644
--- a/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
+++ b/src/Microsoft.ML.Transforms/OptionalColumnTransform.cs
@@ -464,7 +464,8 @@ private Delegate MakeGetterVec<T>(int length)
             Name = "Transforms.OptionalColumnCreator", 
             UserName = UserName, 
             ShortName = ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""OptionalColumnTransform""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""OptionalColumnTransform""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""OptionalColumnTransform""]/*' />"})]
 
         public static CommonOutputs.TransformOutput MakeOptional(IHostEnvironment env, Arguments input)
         {
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index 569f083da3..131f105115 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -22,22 +22,24 @@
         It converts a collection of text columns to a matrix of token  ngrams/skip-grams counts.
         Features are made of (word/character) n-grams/skip-grams​ and the number of features are equal to the vocabulary size found by analyzing the data.
       </remarks>
+    </member>
+    <example name="TextTransform">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new TextFeaturizer(&quot;Features&quot;, &quot;SentimentText&quot;)
           {
-            KeepDiacritics = false,
-            KeepPunctuations = false,
-            TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
-            OutputTokens = true,
-            StopWordsRemover = new PredefinedStopWordsRemover(),
-            VectorNormalizer = TextTransformTextNormKind.L2,
-            CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
-            WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
+          KeepDiacritics = false,
+          KeepPunctuations = false,
+          TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
+          OutputTokens = true,
+          StopWordsRemover = new PredefinedStopWordsRemover(),
+          VectorNormalizer = TextTransformTextNormKind.L2,
+          CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
+          WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
           });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="WordTokenizer">
       <summary>
@@ -51,12 +53,14 @@
         Empty strings and strings containing only spaces are dropped.
         This transform is not typically used on its own, but it is one of the transforms composing the Text Featurizer.
       </remarks>
+    </member>
+    <example name="WordTokenizer">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add( new WordTokenizer(&quot;TextColumn&quot;){ TermSeparators = &quot;&apos; &apos;, &apos;\t&apos;, &apos;;&apos;&quot;  } );
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="NgramTranslator">
       <summary>
@@ -69,12 +73,17 @@
         Embedding ngrams in a vector space allows their contents to be compared in an efficient manner. 
         The slot values in the vector can be weighted by the following factors:
         <list>
-          <item><description>term frequency - The number of occurrences of the slot in the text</description></item>
-          <item><description>
-              inverse document frequency - A ratio (the logarithm of inverse relative slot frequency)
-              that measures the information a slot provides by determining how common or rare it is across the entire text.
-            </description></item>
-            <item><description>term frequency-inverse document frequency - the product term frequency and the inverse document frequency.</description></item>
+          <item><term>term frequency</term>
+            <description>the number of occurrences of the slot in the text</description>
+          </item>
+          <item><term>inverse document frequency</term>
+            <description>a ratio (the logarithm of inverse relative slot frequency)
+              that measures the information a slot provides by determining how common or rare it is across the entire text.</description>
+          </item>
+          <item>
+            <term>term frequency-inverse document frequency</term>
+            <description>the product of the term frequency and the inverse document frequency.</description>
+          </item>
         </list>
         This transform is not typically used on its own, but it is one of the transforms composing the <see cref="Microsoft.ML.Transforms.TextFeaturizer">Text Featurizer</see> .
       </remarks>
@@ -83,7 +92,7 @@
       <seealso cref="Microsoft.ML.Transforms.TextFeaturizer"/>
       <seealso cref="Microsoft.ML.Transforms.CharacterTokenizer"/>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new NGramTranslator(&quot;TextColumn&quot;){ Weighting=NgramTransformWeightingCriteria.TfIdf  } );
       </code>
       </example>
@@ -103,16 +112,18 @@
           (where 0 is a negative sentiment and 1 is a positive sentiment).</para> 
           <para>Currently it supports only English.</para>
       </remarks>
-      <example>
-        <code>
+    </member>
+    <example name="SentimentAnalyzer">
+      <example>
+        <code language="csharp">
           pipeline.Add(new SentimentAnalyzer(){ Source = &quot;TextColumn&quot; }  );
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="CharacterTokenizer">
       <summary>
-        This transform breaks text into individual tokens, each consisting of individual character.
+        This transform breaks text into individual tokens, each consisting of a single character.
       </summary>
       <remarks>
       This transform is not typically used on its own, but it is one of the transforms composing the 
@@ -123,7 +134,7 @@
       <seealso cref="Microsoft.ML.Transforms.NGramTranslator"/>
       <seealso cref="Microsoft.ML.Transforms.TextFeaturizer"/>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new CharacterTokenizer("TextCol1" , "TextCol2" ) );
         </code>
       </example>
@@ -154,12 +165,14 @@
           </list>
         </para>
       </remarks>
+    </member>
+    <example name="LightLDA">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new LightLda(("InTextCol" , "OutTextCol")));
         </code>
       </example>
-    </member>
+    </example>
 
   </members>
 </doc>
diff --git a/src/Microsoft.ML.Transforms/UngroupTransform.cs b/src/Microsoft.ML.Transforms/UngroupTransform.cs
index 387320da81..0964227024 100644
--- a/src/Microsoft.ML.Transforms/UngroupTransform.cs
+++ b/src/Microsoft.ML.Transforms/UngroupTransform.cs
@@ -631,7 +631,8 @@ public static partial class GroupingOperations
             Desc = UngroupTransform.Summary, 
             UserName = UngroupTransform.UserName, 
             ShortName = UngroupTransform.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""Ungroup""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""Ungroup""]/*' />",
+                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""Ungroup""]/*' />"})]
         public static CommonOutputs.TransformOutput Ungroup(IHostEnvironment env, UngroupTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index a3230b82c2..f752009013 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -11,12 +11,14 @@
         value and using the hash as an index in the bag.
         If the input column is a vector, a single indicator bag is returned for it.
       </remarks>
+    </member>
+    <example name="CategoricalHashOneHotVectorizer">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new CategoricalHashOneHotVectorizer(&quot;Text1&quot;) { HashBits = 10, Seed = 314489979, OutputKind = CategoricalTransformOutputKind.Bag });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="CategoricalOneHotVectorizer">
       <summary>
@@ -39,14 +41,16 @@
         for Ind they are concatenated and for Bag they are added.
         When the source column is a singleton, the Ind and Bag options are identical.</para>
       </remarks>
+    </member>
+    <example name="CategoricalOneHotVectorizer">
       <example>
-        An example of how to add the CategoricalOneHotVectorizer transform to a pipeline with two text column 
+        An example of how to add the CategoricalOneHotVectorizer transform to a pipeline with two text column
         features named &quot;Text1&quot; and &quot;Text2&quot;.
-        <code>
+        <code language="csharp">
           pipeline.Add(new CategoricalOneHotVectorizer(&quot;Text1&quot;, &quot;Text1&quot;));
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="CountFeatureSelection">
       <summary>
@@ -56,16 +60,18 @@
         <para>
           This transform uses a set of aggregators to count the number of non-default values for each slot and
           instantiates a <see cref="Microsoft.ML.Runtime.Data.DropSlotsTransform"/> to actually drop the slots.
-          This transform is useful when applied together with a CategoricalHashOneHotVectorizer. 
+          This transform is useful when applied together with a <see cref="T:Microsoft.ML.Transforms.CategoricalHashOneHotVectorizer"/>.
           The count feature selection can remove those features generated by the hash transform that have no data in the examples.
         </para>
       </remarks>
-      <example>
-        <code>
+    </member>
+    <example name="CountFeatureSelection">
+      <example>
+        <code language="csharp">
           pipeline.Add(new FeatureSelectorByCount() { Column = new[]{ &quot;Feature1&quot; }, Count = 2 });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="MutualInformationFeatureSelection">
       <summary>
@@ -85,12 +91,14 @@
         It keeps the top SlotsInOutput features with the largest mutual information with the label.
         </para>
       </remarks>
+    </member>
+    <example name="MutualInformationFeatureSelection">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new FeatureSelectorByMutualInformation() { Column = new[]{ &quot;Feature1&quot; }, SlotsInOutput = 6 });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="MutualInformationFeatureSelection">
       <summary>
@@ -112,13 +120,13 @@
         </para>
       </remarks>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new FeatureSelectorByMutualInformation() { Column = new[]{ &quot;Feature1&quot;}, SlotsInOutput = 6 });
         </code>
       </example>
     </member>
 
-    <member name="OptionalColumnTransform">
+    <member name="OptionalColumnTransform">
       <summary>
         If the user wish to create additional columns with a particular type and default values, 
         or replicated the values from one column to another, changing their type, they can do so using this transform. 
@@ -127,12 +135,14 @@
       </summary>
       <remarks>        
       </remarks>
+    </member>
+    <example name="OptionalColumnTransform">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new OptionalColumnCreator() { Column = new[]{ &quot;OptColumn&quot;} });
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="HashJoin">
       <summary>
@@ -141,28 +151,32 @@
       </summary>
       <remarks>
         This transform can be helpful for ranking and cross-validation. In the case of ranking, where the GroupIdColumn column is required,
-        and needs to be of a key type you can use the CategoricalHashOneHotVectorizer to hash the text value of a single GroupID column into a key value.
+        and needs to be of a key type you can use the <see cref="T:Microsoft.ML.Transforms.CategoricalHashOneHotVectorizer" /> to hash the text value of a single GroupID column into a key value.
         If the GroupID is the combination of the values from multiple columns, you can use the HashConverter to hash multiple text columns into one key column. 
         Similarly with CrossValidator and the StratificationColumn. 
       </remarks>
+    </member>
+    <example name="HashJoin">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new HashConverter(&quot;Column1&quot;, &quot;Column2&quot;));
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="NADrop">
       <summary>
         Removes missing values from vector type columns.
       </summary>
-      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+      <seealso cref="T:Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+    </member>
+    <example name="NADrop">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new MissingValuesDropper(&quot;Column1&quot;));
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="NAIndicator">
       <summary>
@@ -170,12 +184,14 @@
         creating output columns that indicate, through the true/false booleans whether the row has a missing value.
       </summary>
       <seealso cref=" Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+    </member>
+    <example name="NAIndicator">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new MissingValueIndicator(&quot;Column1&quot;));
         </code>
       </example>
-    </member>
+    </example>
     
     <member name="NAReplace">
       <summary>
@@ -189,12 +205,57 @@
         Imputation modes are supported for vectors both by slot and across all slots.
       </remarks>
       <seealso cref=" Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+    </member>
+    <example name="NAReplace">
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new MissingValueSubstitutor(&quot;FeatureCol&quot;){ ReplacementKind = NAReplaceTransformReplacementKind.Mean });
         </code>
       </example>
+    </example>
+
+    <member name="NAHandle">
+      <summary>
+        Handle missing values by replacing them with either the default value or the indicated value.
+      </summary>
+      <remarks>
+        This transform handles missing values in the input columns. For each input column, it creates an output column
+        where the missing values are replaced by one of these specified values:
+        <list type="bullet">
+          <item>
+            <description>The default value of the appropriate type.</description>
+          </item>
+          <item>
+            <description>The mean value of the appropriate type.</description>
+          </item>
+          <item>
+            <description>The max value of the appropriate type.</description>
+          </item>
+          <item>
+            <description>The min value of the appropriate type.</description>
+          </item>
+        </list>
+        <para>The last three work only for numeric/TimeSpan/DateTime kind columns.</para>
+        <para>
+          The output column can also optionally include an indicator vector for which slots were missing in the input column.
+          This can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns.
+        </para>
+        <para>
+          When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot.
+          This option has a default value of true for variable length vectors, and false for known length vectors.
+          It can be changed to true for known length vectors, but it results in an error if changed to false for variable length vectors.
+        </para>
+      </remarks>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"/>
+      <seealso cref="T:Microsoft.ML.Data.DataKind"/>
     </member>
+    <example name="NAHandle">
+      <example>
+        <code language="csharp">
+          pipeline.Add(new MissingValueHandler(&quot;FeatureCol&quot;, &quot;CleanFeatureCol&quot;) { ReplaceWith  = NAHandleTransformReplacementKind.Mean });
+        </code>
+      </example>
+    </example>
     
     <member name="LpNormalize">
       <summary>
@@ -209,7 +270,7 @@
       </remarks>
       <seealso cref=" Microsoft.ML.Transforms.GcNormalize"></seealso>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new LpNormalizer("FeatureCol"){ NormKind = LpNormNormalizerTransformNormalizerKind.L1Norm});
         </code>
       </example>
@@ -227,13 +288,13 @@
       </remarks>
       <seealso cref=" Microsoft.ML.Transforms.LpNormalizer"></seealso>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new GlobalContrastNormalizer(&quot;FeatureCol&quot;){ SubMean= false });
         </code>
       </example>
     </member>
     
-  <member name="Ungroup">
+  <member name="Ungroup">
       <summary>
         Un-groups vector columns into sequences of rows, inverse of Group transform.
        </summary>
@@ -266,35 +327,14 @@
         except slot names is preserved.
         </para>
       </remarks>
-      <example>
-        <code>
-          pipeline.Add(new Segregator(){ Column = new[]{&quot;Column1&quot; }, Mode = UngroupTransformUngroupMode.First} );
-        </code>
-      </example>
     </member>
-    
-    <member name="TextToKey">
-      <summary>
-       Converts input values (words, numbers, etc.) to index in a dictionary.
-      </summary>
-      <remarks>
-      The TextToKeyConverter transform builds up term vocabularies (dictionaries).
-      The TextToKey Converter and the <see cref="Microsoft.ML.Transforms.HashConverter"/> are the two one primary mechanisms by which raw input is transformed into keys. 
-      If multiple columns are used, each column builds/uses exactly one vocabulary (dictionary).
-      The output columns are KeyType-valued.
-      The Key value is the one-based index of the item in the dictionary.
-      If the key is not found in the dictionary, it is assigned the missing value indicator.
-      This dictionary mapping values to keys is most commonly learnt from the unique values in input data, 
-      but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file.
-      </remarks>
-      <seealso cref="Microsoft.ML.Transforms.HashConverter"/>
-      <seealso cref="Microsoft.ML.Transforms.KeyToTextConverter"/>
+    <example  name="Ungroup">
       <example>
-        <code>
-          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;)){ Sort = TermTransformSortOrder.Occurrence });
+        <code language="csharp">
+          pipeline.Add(new Segregator(){ Column = new[]{&quot;Column1&quot; }, Mode = UngroupTransformUngroupMode.First} );
         </code>
       </example>
-    </member>
+    </example>
 
     <member name="KeyToText">
       <summary>
@@ -312,7 +352,7 @@
       <seealso cref="Microsoft.ML.Transforms.HashConverter"/>
       <seealso cref="Microsoft.ML.Transforms.TextToKeyConverter"/>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new KeyToTextConverter((&quot;InColumn&quot;, &quot;OutColumn&quot; )));
         </code>
       </example>
@@ -334,7 +374,7 @@
       </remarks>
        <seealso cref="Microsoft.ML.Transforms.Segregator"/>
       <example>
-        <code>
+        <code language="csharp">
           pipeline.Add(new CombinerByContiguousGroupId(){ GroupKey = new []{"Key1", "Key2" } } );
         </code>
       </example>

From 6f45e6982f255e508889acd560ff0d0f1678e99e Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Tue, 17 Jul 2018 11:07:02 -0700
Subject: [PATCH 11/14] small fixes

---
 .../LogisticRegression/LogisticRegression.cs  |  2 +-
 .../MulticlassLogisticRegression.cs           |  2 +-
 .../CategoricalTransform.cs                   |  4 +-
 src/Microsoft.ML.Transforms/doc.xml           |  6 +--
 src/Microsoft.ML/CSharpApi.cs                 | 45 +++++++++++++++++--
 5 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
index 1d63d66d2d..16ecffe572 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/LogisticRegression.cs
@@ -393,7 +393,7 @@ public override ParameterMixingCalibratedPredictor CreatePredictor()
             UserName = UserNameValue,
             ShortName = ShortName,
             XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name=""LogisticRegressionBinaryClassifier""]/*' />"})]
                             
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, Arguments input)
         {
diff --git a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
index 8067266577..df8eb44f14 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/LogisticRegression/MulticlassLogisticRegression.cs
@@ -967,7 +967,7 @@ public partial class LogisticRegression
             UserName = MulticlassLogisticRegression.UserNameValue,
             ShortName = MulticlassLogisticRegression.ShortName,
             XmlInclude = new[] { @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name=""LBFGS""]/*' />",
-                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
+                                 @"<include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name=""LogisticRegressionClassifier""]/*' />" })]
         public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, MulticlassLogisticRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/CategoricalTransform.cs b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
index 2d4e9e1807..420db1b731 100644
--- a/src/Microsoft.ML.Transforms/CategoricalTransform.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalTransform.cs
@@ -278,8 +278,8 @@ public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment en
         [TlcModule.EntryPoint(Name = "Transforms.TextToKeyConverter",
             Desc = TermTransform.Summary, 
             UserName = TermTransform.UserName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name=""TextToKey""]/*' />",
-                                 @"<include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name=""TextToKey""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name=""TextToKey""]/*' />",
+                                 @"<include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name=""TextToKey""]/*' />" })]
         public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, TermTransform.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index f752009013..5d8b8649ad 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -126,7 +126,7 @@
       </example>
     </member>
 
-    <member>
+    <member name="OptionalColumnTransform">
       <summary>
         If the user wish to create additional columns with a particular type and default values, 
         or replicated the values from one column to another, changing their type, they can do so using this transform. 
@@ -294,7 +294,7 @@
       </example>
     </member>
     
-  <member>
+  <member name="Ungroup">
       <summary>
         Un-groups vector columns into sequences of rows, inverse of Group transform.
        </summary>
@@ -328,7 +328,7 @@
         </para>
       </remarks>
     </member>
-    <example  name="Ungroup">
+    <example name="Ungroup">
       <example>
         <code language="csharp">
           pipeline.Add(new Segregator(){ Column = new[]{&quot;Column1&quot; }, Mode = UngroupTransformUngroupMode.First} );
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index d43ff928f7..b6ce7778e1 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -4134,6 +4134,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name="AP"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/example[@name="AP"]/*' />
         public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4638,6 +4639,7 @@ public enum Bundle : byte
 
 
         /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name="FastForestBinaryClassifier"]/*' />
         public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -4929,6 +4931,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastForest"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name="FastForestRegressor"]/*' />
         public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5216,6 +5219,7 @@ public enum BoostedTreeArgsOptimizationAlgorithmType
 
 
         /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name="FastTreeBinaryClassifier"]/*' />
         public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -5605,6 +5609,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name="FastTreeRanker"]/*' />
         public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6029,6 +6034,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name="FastTree"]/*' />
+        /// <include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name="FastTreeRegressor"]/*' />
         public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -6802,6 +6808,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/member[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml' path='doc/members/example[@name="FieldAwareFactorizationMachineBinaryClassifier"]/*' />
         public sealed partial class FieldAwareFactorizationMachineBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7229,6 +7236,7 @@ public enum KMeansPlusPlusTrainerInitAlgorithm
 
 
         /// <include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/member[@name="KMeans++"]/*' />
+        /// <include file='../Microsoft.ML.KMeansClustering/doc.xml' path='doc/members/example[@name="KMeans++"]/*' />
         public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7345,6 +7353,7 @@ public enum LightGbmArgumentsEvalMetricType
 
 
         /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name="LightGbmBinaryClassifier"]/*' />
         public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7548,6 +7557,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name="LightGbmClassifier"]/*' />
         public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7751,6 +7761,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name="LightGbmRanker"]/*' />
         public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -7954,6 +7965,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
+        /// <include file='../Microsoft.ML.LightGBM/doc.xml' path='doc/members/example[@name="LightGbmRegressor"]/*' />
         public sealed partial class LightGbmRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8292,7 +8304,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name="LogisticRegressionBinaryClassifier"]/*' />
         public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8441,7 +8453,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/member[@name="LBFGS"]/*' />
-        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='docs/members/example[@name="LogisticRegressionClassifier"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/LogisticRegression/doc.xml' path='doc/members/example[@name="LogisticRegressionClassifier"]/*' />
         public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8590,6 +8602,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/member[@name="MultiClassNaiveBayesTrainer"]'/>
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml' path='doc/members/example[@name="MultiClassNaiveBayesTrainer"]'/>
         public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8661,6 +8674,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/member[@name="OGD"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/Online/doc.xml' path='doc/members/example[@name="OGD"]/*' />
         public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8814,6 +8828,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/example[@name="PcaAnomalyDetector"]/*' />
         public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -8908,6 +8923,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/member[@name="PoissonRegression"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml' path='doc/members/example[@name="PoissonRegression"]/*' />
         public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9051,6 +9067,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name="StochasticDualCoordinateAscentBinaryClassifier"]/*'/>
         public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9190,6 +9207,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name="StochasticDualCoordinateAscentClassifier"]/*' />
         public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9313,6 +9331,7 @@ namespace Trainers
     {
 
         /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/member[@name="SDCA"]/*' />
+        /// <include file='../Microsoft.ML.StandardLearners/Standard/doc.xml' path='doc/members/example[@name="StochasticDualCoordinateAscentRegressor"]/*' />
         public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -9916,6 +9935,7 @@ public sealed partial class CategoricalHashTransformColumn : OneToOneColumn<Cate
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CategoricalHashOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="CategoricalHashOneHotVectorizer"]/*' />
         public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -10089,6 +10109,7 @@ public sealed partial class CategoricalTransformColumn : OneToOneColumn<Categori
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CategoricalOneHotVectorizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="CategoricalOneHotVectorizer"]/*' />
         public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11397,6 +11418,7 @@ namespace Transforms
     {
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="CountFeatureSelection"]'/>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="CountFeatureSelection"]'/>
         public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11465,6 +11487,7 @@ namespace Transforms
     {
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]'/>
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="MutualInformationFeatureSelection"]'/>
         public sealed partial class FeatureSelectorByMutualInformation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -11729,6 +11752,7 @@ public sealed partial class HashJoinTransformColumn : OneToOneColumn<HashJoinTra
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="HashJoin"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="HashJoin"]/*' />
         public sealed partial class HashConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -12302,6 +12326,7 @@ public sealed partial class LdaTransformColumn : OneToOneColumn<LdaTransformColu
         }
 
         /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="LightLDA"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="LightLDA"]/*' />
         public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13053,7 +13078,8 @@ public sealed partial class NAHandleTransformColumn : OneToOneColumn<NAHandleTra
 
         }
 
-        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="NAFilter"]/*' />
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="NAHandle"]/*' />
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name="NAHandle"]/*' />
         public sealed partial class MissingValueHandler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13186,6 +13212,7 @@ public sealed partial class NAIndicatorTransformColumn : OneToOneColumn<NAIndica
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NAIndicator"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="NAIndicator"]/*' />
         public sealed partial class MissingValueIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13303,6 +13330,7 @@ public sealed partial class NADropTransformColumn : OneToOneColumn<NADropTransfo
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NADrop"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="NADrop"]/*' />
         public sealed partial class MissingValuesDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13406,6 +13434,7 @@ namespace Transforms
     {
 
         /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="NAFilter"]/*' />
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name="NAFilter"]/*' />
         public sealed partial class MissingValuesRowDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13512,6 +13541,7 @@ public sealed partial class NAReplaceTransformColumn : OneToOneColumn<NAReplaceT
         }
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="NAReplace"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="NAReplace"]/*' />
         public sealed partial class MissingValueSubstitutor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13886,6 +13916,7 @@ namespace Transforms
     {
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="OptionalColumnTransform"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="OptionalColumnTransform"]/*' />
         public sealed partial class OptionalColumnCreator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -13988,6 +14019,7 @@ public sealed partial class PcaTransformColumn : OneToOneColumn<PcaTransformColu
         }
 
         /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/member[@name="PCA"]/*' />
+        /// <include file='../Microsoft.ML.PCA/doc.xml' path='doc/members/example[@name="PcaCalculator"]/*' />
         public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14669,6 +14701,7 @@ public enum UngroupTransformUngroupMode
 
 
         /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="Ungroup"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/example[@name="Ungroup"]/*' />
         public sealed partial class Segregator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14737,6 +14770,7 @@ namespace Transforms
     {
 
         /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="SentimentAnalyzer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="SentimentAnalyzer"]/*' />
         public sealed partial class SentimentAnalyzer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -14994,6 +15028,7 @@ public sealed partial class TermLoaderArguments
         }
 
         /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="TextTransform"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="TextTransform"]/*' />
         public sealed partial class TextFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -15128,7 +15163,8 @@ public TextFeaturizerPipelineStep(Output output)
     namespace Transforms
     {
 
-        /// <include file='../Microsoft.ML.Transforms/doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
+        /// <include file='../Microsoft.ML.Data/Transforms/doc.xml' path='doc/members/example[@name="TextToKey"]/*' />
         public sealed partial class TextToKeyConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 
@@ -15418,6 +15454,7 @@ public sealed partial class DelimitedTokenizeTransformColumn : OneToOneColumn<De
         }
 
         /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/member[@name="WordTokenizer"]/*' />
+        /// <include file='../Microsoft.ML.Transforms/Text/doc.xml' path='doc/members/example[@name="WordTokenizer"]/*' />
         public sealed partial class WordTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem
         {
 

From 7f5ac2bd795999e6a21a457e5d3391246dc797ca Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Wed, 18 Jul 2018 11:00:35 -0700
Subject: [PATCH 12/14] addressing code comments

---
 src/Microsoft.ML.Data/Transforms/doc.xml      |   9 +-
 src/Microsoft.ML.FastTree/doc.xml             |   4 +-
 src/Microsoft.ML.PCA/doc.xml                  |   2 +-
 .../FactorizationMachine/doc.xml              |   6 +-
 .../Standard/MultiClass/Pkpd.cs               |   4 +-
 .../Standard/MultiClass/doc.xml               |  31 ++---
 .../Standard/Online/doc.xml                   |   4 +-
 .../Standard/PoissonRegression/doc.xml        |   2 +-
 .../Standard/doc.xml                          |  10 +-
 src/Microsoft.ML.Transforms/Text/doc.xml      |  42 ++++---
 src/Microsoft.ML.Transforms/doc.xml           | 108 ++++++++++--------
 11 files changed, 124 insertions(+), 98 deletions(-)

diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml
index 808595147e..a3d4ba9f5e 100644
--- a/src/Microsoft.ML.Data/Transforms/doc.xml
+++ b/src/Microsoft.ML.Data/Transforms/doc.xml
@@ -6,7 +6,7 @@
         Removes missing values from vector type columns.
       </summary>
       <remarks>
-        This transform emoves the entire row if any of the input columns have a missing value in that row.
+        This transform removes the entire row if any of the input columns have a missing value in that row.
         This preprocessing is required for many ML algorithms that cannot work with missing values.
         Useful if any missing entry invalidates the entire row.
         If the <see cref="Microsoft.ML.Runtime.Data.NAFilter.Defaults.Complement"/> is set to true, this transform would do the exact opposite,
@@ -29,7 +29,7 @@
       <remarks>
         The TextToKeyConverter transform builds up term vocabularies (dictionaries).
         The TextToKey Converter and the <see cref="T:Microsoft.ML.Transforms.HashConverter"/> are the two one primary mechanisms by which raw input is transformed into keys.
-        If multiple columns are used, each column builds/uses exactly one vocabulary (dictionary).
+        If multiple columns are used, each column builds/uses exactly one vocabulary.
         The output columns are KeyType-valued.
         The Key value is the one-based index of the item in the dictionary.
         If the key is not found in the dictionary, it is assigned the missing value indicator.
@@ -42,7 +42,10 @@
     <example name="TextToKey">
       <example>
         <code language="csharp">
-          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;)){ Sort = TermTransformSortOrder.Occurrence });
+          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;))
+          { 
+            Sort = TermTransformSortOrder.Occurrence 
+          });
         </code>
       </example>
     </example>
diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml
index 170ae8bc9d..8678654182 100644
--- a/src/Microsoft.ML.FastTree/doc.xml
+++ b/src/Microsoft.ML.FastTree/doc.xml
@@ -24,7 +24,7 @@
           The output of the ensemble produced by MART on a given instance is the sum of the tree outputs.
         </para>
         <list type='bullet'>
-          <item><description> In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</description></item>
+          <item><description>In case of a binary classification problem, the output is converted to a probability by using some form of calibration.</description></item>
           <item><description>In case of a regression problem, the output is the predicted value of the function.</description></item>
           <item><description>In case of a ranking problem, the instances are ordered by the output value of the ensemble.</description></item>
         </list>
@@ -163,7 +163,7 @@
         There are a number of famous or popular examples of this technique:
         <list>
           <item><description>A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'.
-            It is observed that the Euclidian distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together,
+            It is observed that the Euclidean distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together,
             and far away from pictures of kittens. </description></item>
           <item><description>A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items.</description></item>
           <item><description>The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model,
diff --git a/src/Microsoft.ML.PCA/doc.xml b/src/Microsoft.ML.PCA/doc.xml
index 66bf06cc30..c4f0be7758 100644
--- a/src/Microsoft.ML.PCA/doc.xml
+++ b/src/Microsoft.ML.PCA/doc.xml
@@ -4,7 +4,7 @@
     
     <member name="PCA">
       <summary>
-        PCA is a dimensionality-reduction transform which computes the projection of the feature vector to onto a low-rank subspace. 
+        PCA is a dimensionality-reduction transform which computes the projection of the feature vector onto a low-rank subspace. 
       </summary>
       <remarks>
       <a href='https://en.wikipedia.org/wiki/Principal_component_analysis'>Principle Component Analysis (PCA)</a> is a dimensionality-reduction algorithm which computes the projection of the feature vector to onto a low-rank subspace.
diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
index 3a2e5f72ad..f18bf60990 100644
--- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml
@@ -30,7 +30,11 @@
     <example name="FieldAwareFactorizationMachineBinaryClassifier">
       <example>
         <code language="csharp">
-          pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier(){ LearningRate = 0.5f, Iter=2 });
+          pipeline.Add(new FieldAwareFactorizationMachineBinaryClassifier
+          { 
+            LearningRate = 0.5f, 
+            Iter = 2
+          });
         </code>
       </example>
     </example>
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
index 1e26987cdb..c1488fc6ff 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs
@@ -37,7 +37,7 @@ namespace Microsoft.ML.Runtime.Learners
     /// 2, we would train classifiers for the pairs (0,0), (0,1), (0,2), (1,1), (1,2),
     /// and(2,2). For each binary classifier, an input data point is considered a
     /// positive example if it is in either of the two classes in the pair, and a
-    /// negative example otherwise.At prediction time, the probabilities for each
+    /// negative example otherwise. At prediction time, the probabilities for each
     /// pair of classes is considered as the probability of being in either class of
     /// the pair given the data, and the final predictive probabilities out of that
     /// per class are calculated given the probability that an example is in any given
@@ -51,7 +51,7 @@ namespace Microsoft.ML.Runtime.Learners
     /// practical due to, usually, memory constraints.For example, while a multiclass
     /// logistic regression is a more principled way to solve a multiclass problem, it
     /// requires that the learner store a lot more intermediate state in the form of
-    /// L-BFGS history for all classes * simultaneously*, rather than just one-by-one
+    /// L-BFGS history for all classes *simultaneously*, rather than just one-by-one
     /// as would be needed for OVA.
     /// </summary>
     public sealed class Pkpd : MetaMulticlassTrainer<PkpdPredictor, Pkpd.Arguments>
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
index e4e6580a19..8d2af374db 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/doc.xml
@@ -13,10 +13,6 @@
         It assumes independence among the presence of features in a class even though they may be dependent on each other.
         This multi-class trainer accepts binary feature values of type float, i.e., feature values are either true or false.
         Specifically a feature value greater than zero is treated as true.
-        These learner will request normalization from the data pipeline if the
-        classifier indicates it would benefit from it. Note that even if the
-        classifier indicates that it does not need caching, OVA will always
-        request caching, as it will be performing multiple passes over the data set.
       </remarks>
       <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'/>
       <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'/>
@@ -26,25 +22,30 @@
     <example name="MultiClassNaiveBayesTrainer">
       <example>
         <code language="csharp">
-          pipeline.Add(new NaiveBayesClassifier(){ NormalizeFeatures = NormalizeOption.Auto, Caching = CachingOptions.Memory });
+          pipeline.Add(new NaiveBayesClassifier
+          {
+            NormalizeFeatures = NormalizeOption.Auto,
+            Caching = CachingOptions.Memory
+          });
         </code>
       </example>
     </example>
 
     <member name="OVA">
       <summary>
-        In this strategy, a binary classification algorithm is used to train one classifier for each class, which distinguishes that class from all other classes.
-        Prediction is then performed by running these binary classifiers, and choosing the prediction with the highest confidence score.
+        Trains a one-versus-all multi-class classifier on top of the specified binary classifier.
       </summary>
       <remarks>
-        <para>This algorithm can be treated as a wrapper for all the binary classifiers in ML.NET. 
-        A few binary classifiers already have implementation for multi-class problems, 
-        thus users can choose either one depending on the context. 
-        </para>
-        <para>
-          The OVA version of a binary classifier, such as wrapping a LightGbmBinaryClassifier ,
-          can be different from LightGbmClassifier, which develops a multi-class classifier directly. 
-        </para>
+        <para>In this strategy, a binary classification algorithm is used to train one classifier for each class, which distinguishes that class from all other classes.
+        Prediction is then performed by running these binary classifiers, and choosing the prediction with the highest confidence score.</para>
+        <para>This algorithm can be used with any of the binary classifiers in ML.NET.
+        A few binary classifiers already have implementations for multi-class problems,
+        thus users can choose either one depending on the context.</para>
+        <para>The OVA version of a binary classifier, such as wrapping a LightGbmBinaryClassifier,
+        can be different from LightGbmClassifier, which develops a multi-class classifier directly.</para>
+        <para>Note that even if the classifier indicates that it does not need caching, OneVersusAll will always
+        request caching, as it will be performing multiple passes over the data set.
+        This learner will request normalization from the data pipeline if the classifier indicates it would benefit from it.</para>
       </remarks>
       <seealso cref='Microsoft.ML.Trainers.LogisticRegressionClassifier'/>
       <seealso cref='Microsoft.ML.Trainers.LightGbmClassifier'/>
diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
index 08d1c161f9..0ace721221 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml
@@ -16,7 +16,7 @@
     <example>
       <example name="OGD">
         <code language="csharp">
-          new OnlineGradientDescentRegressor()
+          new OnlineGradientDescentRegressor
           {
             NumIterations = 10,
             L2RegularizerWeight = 0.6f,
@@ -54,7 +54,7 @@
     <example>
       <example name="AP">
         <code language="csharp">
-          new AveragedPerceptronBinaryClassifier()
+          new AveragedPerceptronBinaryClassifier
           {
             NumIterations = 10,
             L2RegularizerWeight = 0.01f,
diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
index 62daf1ecf3..ec14c9446b 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml
@@ -15,7 +15,7 @@
     <example>
       <example name="PoissonRegression">
         <code language="csharp">
-          new PoissonRegressor()
+          new PoissonRegressor
           {
             MaxIterations = 100,
             L2Weight = 0.6f
diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
index acef787636..de4d4bc85b 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
@@ -7,7 +7,8 @@
         Train an SDCA linear model.
       </summary>
       <remarks>
-        This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.
+        This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex,
+        dual coordinate, objective functions.
         The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.
         <para>
           Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
@@ -35,7 +36,7 @@
     <example name="StochasticDualCoordinateAscentBinaryClassifier">
       <example>
         <code language="csharp">
-          new StochasticDualCoordinateAscentBinaryClassifier()
+          new StochasticDualCoordinateAscentBinaryClassifier
           {
             MaxIterations = 100,
             NumThreads = 7,
@@ -48,7 +49,7 @@
     <example name="StochasticDualCoordinateAscentClassifier">
       <example>
         <code language="csharp">
-          new StochasticDualCoordinateAscentClassifier()
+          new StochasticDualCoordinateAscentClassifier
           {
             MaxIterations = 100,
             NumThreads = 7,
@@ -63,8 +64,7 @@
           new StochasticDualCoordinateAscentRegressor
           {
             MaxIterations = 100,
-            NumThreads = 5,
-            LossFunction = new
+            NumThreads = 5
           }
         </code>
       </example>
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index 131f105115..f22dd45bbc 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -28,14 +28,14 @@
         <code language="csharp">
           pipeline.Add(new TextFeaturizer(&quot;Features&quot;, &quot;SentimentText&quot;)
           {
-          KeepDiacritics = false,
-          KeepPunctuations = false,
-          TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
-          OutputTokens = true,
-          StopWordsRemover = new PredefinedStopWordsRemover(),
-          VectorNormalizer = TextTransformTextNormKind.L2,
-          CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
-          WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
+            KeepDiacritics = false,
+            KeepPunctuations = false,
+            TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
+            OutputTokens = true,
+            StopWordsRemover = new PredefinedStopWordsRemover(),
+            VectorNormalizer = TextTransformTextNormKind.L2,
+            CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
+            WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
           });
         </code>
       </example>
@@ -43,7 +43,6 @@
 
     <member name="WordTokenizer">
       <summary>
-        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. 
         The separator is space, but can be specified as any other character (or multiple characters) if needed.
       </summary>
       <remarks>
@@ -57,7 +56,10 @@
     <example name="WordTokenizer">
       <example>
         <code language="csharp">
-          pipeline.Add( new WordTokenizer(&quot;TextColumn&quot;){ TermSeparators = &quot;&apos; &apos;, &apos;\t&apos;, &apos;;&apos;&quot;  } );
+          pipeline.Add(new WordTokenizer(&quot;TextColumn&quot;)
+          { 
+            TermSeparators = &quot;&apos; &apos;, &apos;\t&apos;, &apos;;&apos;&quot;  
+          });
         </code>
       </example>
     </example>
@@ -73,15 +75,17 @@
         Embedding ngrams in a vector space allows their contents to be compared in an efficient manner. 
         The slot values in the vector can be weighted by the following factors:
         <list>
-          <item>term frequency
+          <item>
+            <term>term frequency</term>
             <description> the number of occurrences of the slot in the text</description>
           </item>
-          <item>inverse document frequency
+          <item>
+            <term>inverse document frequency</term>
             <description> a ratio (the logarithm of inverse relative slot frequency)
               that measures the information a slot provides by determining how common or rare it is across the entire text.</description>
           </item>
             <item>
-              term frequency-inverse document frequency
+              <term>term frequency-inverse document frequency</term>
               <description> the product term frequency and the inverse document frequency.</description>
             </item>
         </list>
@@ -93,7 +97,10 @@
       <seealso cref="Microsoft.ML.Transforms.CharacterTokenizer"/>
       <example>
         <code language="csharp">
-          pipeline.Add(new NGramTranslator(&quot;TextColumn&quot;){ Weighting=NgramTransformWeightingCriteria.TfIdf  } );
+          pipeline.Add(new NGramTranslator(&quot;TextColumn&quot;)
+          { 
+            Weighting = NgramTransformWeightingCriteria.TfIdf
+          });
       </code>
       </example>
     </member>
@@ -116,7 +123,10 @@
     <exaple>
       <example name="SentimentAnalyzer">
         <code language="csharp">
-          pipeline.Add(new SentimentAnalyzer(){ Source = &quot;TextColumn&quot; }  );
+          pipeline.Add(new SentimentAnalyzer()
+          { 
+            Source = &quot;TextColumn&quot; 
+          });
         </code>
       </example>
     </exaple>
@@ -135,7 +145,7 @@
       <seealso cref="Microsoft.ML.Transforms.TextFeaturizer"/>
       <example>
         <code language="csharp">
-          pipeline.Add(new CharacterTokenizer("TextCol1" , "TextCol2" ) );
+          pipeline.Add(new CharacterTokenizer(&quot;TextCol1&quot;, &quot;TextCol2&quot;));
         </code>
       </example>
     </member>
diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
index 5d8b8649ad..cb6ef6af25 100644
--- a/src/Microsoft.ML.Transforms/doc.xml
+++ b/src/Microsoft.ML.Transforms/doc.xml
@@ -15,7 +15,12 @@
     <example name="CategoricalHashOneHotVectorizer">
       <example>
         <code language="csharp">
-          pipeline.Add(new CategoricalHashOneHotVectorizer(&quot;Text1&quot;) { HashBits = 10, Seed = 314489979, OutputKind = CategoricalTransformOutputKind.Bag });
+          pipeline.Add(new CategoricalHashOneHotVectorizer(&quot;Text1&quot;) 
+          { 
+            HashBits = 10, 
+            Seed = 314489979, 
+            OutputKind = CategoricalTransformOutputKind.Bag 
+          });
         </code>
       </example>
     </example>
@@ -68,7 +73,11 @@
     <example name="CountFeatureSelection">
        <example>
         <code language="csharp">
-          pipeline.Add(new FeatureSelectorByCount() { Column = new[]{ &quot;Feature1&quot; }, Count = 2 });
+          pipeline.Add(new FeatureSelectorByCount
+          { 
+            Column = new[]{ &quot;Feature1&quot; }, 
+            Count = 2 
+          });
         </code>
       </example>
     </example>
@@ -82,12 +91,11 @@
           The mutual information of two random variables X and Y is a measure of the mutual dependence between the variables.
           Formally, the mutual information can be written as:
         </para>
-          <para>I(X;Y) = E[log(p(x,y)) - log(p(x)) - log(p(y))]</para>
+        <para>I(X;Y) = E[log(p(x,y)) - log(p(x)) - log(p(y))]</para>
         <para>where the expectation is taken over the joint distribution of X and Y. 
         Here p(x,y) is the joint probability density function of X and Y, p(x) and p(y) are the marginal probability density functions of X and Y respectively. 
         In general, a higher mutual information between the dependent variable (or label) and an independent variable (or feature) means 
         that the label has higher mutual dependence over that feature.
-        The mutual information feature selection mode selects the features based on the mutual information. 
         It keeps the top SlotsInOutput features with the largest mutual information with the label.
         </para>
       </remarks>
@@ -95,51 +103,33 @@
     <example name="MutualInformationFeatureSelection">
       <example>
         <code language="csharp">
-          pipeline.Add(new FeatureSelectorByMutualInformation() { Column = new[]{ &quot;Feature1&quot; }, SlotsInOutput = 6 });
+          pipeline.Add(new FeatureSelectorByMutualInformation
+          { 
+            Column = new[]{ &quot;Feature1&quot; }, 
+            SlotsInOutput = 6 
+          });
         </code>
       </example>
     </example>
 
-    <member name="MutualInformationFeatureSelection">
-      <summary>
-        Selects the top k slots across all specified columns ordered by their mutual information with the label column.
-      </summary>
-      <remarks>
-        <para>
-          The mutual information of two random variables X and Y is a measure of the mutual dependence between the variables.
-          Formally, the mutual information can be written as:
-        </para>
-        <para>I(X;Y) = E[log(p(x,y)) - log(p(x)) - log(p(y))]</para>
-        <para>
-          where the expectation is taken over the joint distribution of X and Y.
-          Here p(x,y) is the joint probability density function of X and Y, p(x) and p(y) are the marginal probability density functions of X and Y respectively.
-          In general, a higher mutual information between the dependent variable (or label) and an independent variable (or feature) means
-          that the label has higher mutual dependence over that feature.
-          The mutual information feature selection mode selects the features based on the mutual information.
-          It keeps the top SlotsInOutput features with the largest mutual information with the label.
-        </para>
-      </remarks>
-      <example>
-        <code language="csharp">
-          pipeline.Add(new FeatureSelectorByMutualInformation() { Column = new[]{ &quot;Feature1&quot;}, SlotsInOutput = 6 });
-        </code>
-      </example>
-    </member>
-
     <member name="OptionalColumnTransform">
       <summary>
-        If the user wish to create additional columns with a particular type and default values, 
-        or replicated the values from one column to another, changing their type, they can do so using this transform. 
-        This transform can be used as a workaround to create a Label column after deserializing a model, for prediction. 
-        Some transforms in the serialized model operate on the Label column, and would throw errors during prediction if such a column is not found. 
+        Creates a new column with the specified type and default values.
       </summary>
-      <remarks>        
+      <remarks>
+        If the user wishes to create additional columns with a particular type and default values,
+        or replicate the values from one column to another, changing their type, they can do so using this transform.
+        This transform can be used as a workaround to create a Label column after deserializing a model, for prediction.
+        Some transforms in the serialized model operate on the Label column, and would throw errors during prediction if such a column is not found.
       </remarks>
     </member>
     <example name="OptionalColumnTransform">
       <example>
         <code language="csharp">
-          pipeline.Add(new OptionalColumnCreator() { Column = new[]{ &quot;OptColumn&quot;} });
+          pipeline.Add(new OptionalColumnCreator 
+          { 
+            Column = new[]{ &quot;OptColumn&quot;} 
+          });
         </code>
       </example>
     </example>
@@ -209,7 +199,10 @@
     <example name="NAReplace">
       <example>
         <code language="csharp">
-          pipeline.Add(new MissingValueSubstitutor(&quot;FeatureCol&quot;){ ReplacementKind = NAReplaceTransformReplacementKind.Mean });
+          pipeline.Add(new MissingValueSubstitutor(&quot;FeatureCol&quot;)
+          { 
+            ReplacementKind = NAReplaceTransformReplacementKind.Mean 
+          });
         </code>
       </example>
     </example>
@@ -252,7 +245,10 @@
     <example name="NAHandle">
       <example>
         <code language="csharp">
-          pipeline.Add(new MissingValueHandler(&quot;FeatureCol&quot;, &quot;CleanFeatureCol&quot;) { ReplaceWith  = NAHandleTransformReplacementKind.Mean });
+          pipeline.Add(new MissingValueHandler(&quot;FeatureCol&quot;, &quot;CleanFeatureCol&quot;) 
+          { 
+            ReplaceWith = NAHandleTransformReplacementKind.Mean
+          });
         </code>
       </example>
     </example>
@@ -271,7 +267,10 @@
       <seealso cref=" Microsoft.ML.Transforms.GcNormalize"></seealso>
       <example>
         <code language="csharp">
-          pipeline.Add(new LpNormalizer("FeatureCol"){ NormKind = LpNormNormalizerTransformNormalizerKind.L1Norm});
+          pipeline.Add(new LpNormalizer(&quot;FeatureCol&quot;)
+          { 
+            NormKind = LpNormNormalizerTransformNormalizerKind.L1Norm
+          });
         </code>
       </example>
     </member>
@@ -289,7 +288,10 @@
       <seealso cref=" Microsoft.ML.Transforms.LpNormalizer"></seealso>
       <example>
         <code language="csharp">
-          pipeline.Add(new GlobalContrastNormalizer(&quot;FeatureCol&quot;){ SubMean= false });
+          pipeline.Add(new GlobalContrastNormalizer(&quot;FeatureCol&quot;)
+          { 
+            SubMean = false
+          });
         </code>
       </example>
     </member>
@@ -299,7 +301,7 @@
         Un-groups vector columns into sequences of rows, inverse of Group transform.
        </summary>
       <remarks>
-        <para>This can be thought of as an inverse of the CombinerByContiguousGroupId. 
+        <para>This can be thought of as an inverse of the <see cref="T:Microsoft.ML.Transforms.CombinerByContiguousGroupId"/>.  
         For all specified vector columns ("pivot" columns), performs the "ungroup" (or "unroll") operation as outlined below.
         </para>
         <para>If the only pivot column is called P, and has size K, then for every row of the input we will produce 
@@ -312,9 +314,9 @@
         <list type="bullet">
           <item><description>A number of output rows is controlled by the 'mode' parameter. 
             <list type="bullet">
-              <item>outer<description> it is equal to the maximum length of pivot columns</description></item>
-              <item>inner<description> it is equal to the minimum length of pivot columns</description></item>
-              <item>first<description> it is equal to the length of the first pivot column</description></item>
+              <item><term>outer</term><description> it is equal to the maximum length of pivot columns</description></item>
+              <item><term>inner</term><description> it is equal to the minimum length of pivot columns</description></item>
+              <item><term>first</term><description> it is equal to the length of the first pivot column</description></item>
             </list>
             </description>
           </item>
@@ -331,15 +333,18 @@
     <example name="Ungroup">
       <example>
         <code language="csharp">
-          pipeline.Add(new Segregator(){ Column = new[]{&quot;Column1&quot; }, Mode = UngroupTransformUngroupMode.First} );
+          pipeline.Add(new Segregator
+          { 
+            Column = new[]{ &quot;Column1&quot; },
+            Mode = UngroupTransformUngroupMode.First
+          });
         </code>
       </example>
     </example>
 
     <member name="KeyToText">
       <summary>
-        The KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the
-        KeyValues metadata.
+        Retrieves the original values from a key column.
       </summary>
       <remarks>
         The KeyToTextConverter is the complement of the <see  cref="TextToKeyConverter"/> transform.
@@ -368,14 +373,17 @@
        The resulting data will have all the group key columns preserved, 
        and the aggregated columns will become variable-length vectors of the original types.
        <para>This transform essentially performs the following SQL-like operation:</para> 
-       <para>GroupKey1, GroupKey2, ... GroupKeyK, LIST(Value1), LIST(Value2), ... LIST(ValueN)</para> 
+       <para>SELECT GroupKey1, GroupKey2, ... GroupKeyK, LIST(Value1), LIST(Value2), ... LIST(ValueN)</para> 
        <para>FROM Data</para> 
        <para>GROUP BY GroupKey1, GroupKey2, ... GroupKeyK.</para> 
       </remarks>
        <seealso cref="Microsoft.ML.Transforms.Segregator"/>
       <example>
         <code language="csharp">
-          pipeline.Add(new CombinerByContiguousGroupId(){ GroupKey = new []{"Key1", "Key2" } } );
+          pipeline.Add(new CombinerByContiguousGroupId
+          { 
+            GroupKey = new []{&quot;Key1&quot;, &quot;Key2&quot; } 
+          });
         </code>
       </example>
     </member>

From 43f6540037220a15c2c0d0bbdbfaf9b56ea3c3c0 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Wed, 18 Jul 2018 13:45:38 -0700
Subject: [PATCH 13/14] addressing Pete's comments.

---
 src/Microsoft.ML.StandardLearners/Standard/doc.xml | 3 +--
 src/Microsoft.ML.Transforms/Text/doc.xml           | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
index de4d4bc85b..a704827b88 100644
--- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml
+++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml
@@ -7,8 +7,7 @@
         Train an SDCA linear model.
       </summary>
       <remarks>
-        This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex,
-        dual coordinate, objective functions.
+        This classifier is a trainer based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions.
         The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.
         <para>
           Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.
diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index f22dd45bbc..d6dc9591b9 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -43,7 +43,7 @@
 
     <member name="WordTokenizer">
       <summary>
-        The separator is space, but can be specified as any other character (or multiple characters) if needed.
+        This transform splits the text into words using the separator character(s).
       </summary>
       <remarks>
         The input for this transform is a <see cref="Microsoft.ML.Runtime.Data.DvText">DvText</see> or a vector of <see cref="Microsoft.ML.Runtime.Data.DvText">DvTexts</see>,

From 648425518c45a8d93ca9ad7c4bae098c93c033e5 Mon Sep 17 00:00:00 2001
From: Senja Filipi <sefilipi@microsoft.com>
Date: Wed, 18 Jul 2018 16:10:00 -0700
Subject: [PATCH 14/14] Fixing language around the CharTokenizer description.

---
 src/Microsoft.ML.Transforms/Text/doc.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.Transforms/Text/doc.xml b/src/Microsoft.ML.Transforms/Text/doc.xml
index d6dc9591b9..5f734e1cfd 100644
--- a/src/Microsoft.ML.Transforms/Text/doc.xml
+++ b/src/Microsoft.ML.Transforms/Text/doc.xml
@@ -133,7 +133,7 @@
 
     <member name="CharacterTokenizer">
       <summary>
-        This transform breaks text into individual tokens, each consisting of individual characters.
+        This transform breaks text into individual tokens, each consisting of an individual character.
       </summary>
       <remarks>
       This transform is not typically used on its own, but it is one of the transforms composing the