Statistical algorithms require a concrete way to store the training
data. Here the dataset is represented as a matrix $X \in \mathbb{R}^{N \times p}$ ($N$ observations, $p$ features), stored in a flat
`std::vector<double>` with row-major index
mapping:
| Component | Representation |
| --- | --- |
| Matrix dimensions | `size_t rows, cols;` |
| Elements | `std::vector<double> data;` |
| Index mapping | `index = i * cols + j;` |
#include <algorithm>
#include <cstddef>
#include <limits>
#include <numeric>
#include <utility>
#include <vector>
// Dense matrix of doubles kept in a single contiguous buffer, laid out row by row:
// element (i, j) is stored at data[i * cols + j].
struct Matrix {
    size_t rows;               // number of rows
    size_t cols;               // number of columns; also the stride between rows in `data`
    std::vector<double> data;  // rows * cols values

    // Creates an n_rows x n_cols matrix with every element set to 0.0.
    Matrix(size_t n_rows, size_t n_cols)
        : rows(n_rows), cols(n_cols), data(n_rows * n_cols, 0.0) {}

    // Mutable reference to element (row, col). The indices are not range-checked.
    double& at(size_t row, size_t col) {
        const size_t offset = row * cols + col;
        return data[offset];
    }

    // Read-only reference to element (row, col). The indices are not range-checked.
    const double& at(size_t row, size_t col) const {
        const size_t offset = row * cols + col;
        return data[offset];
    }
};
// Given a dataset
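For continuous regression, the standard choice is MSE, which penalises predictions proportionally to their squared distance from the truth:
$$L(y, F(x)) = \frac{1}{2}\,\bigl(y - F(x)\bigr)^2$$
The factor of $\frac{1}{2}$ is a convenience: it cancels the 2 produced by differentiation, so the gradient with respect to the prediction is simply $F(x) - y$.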
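A regression tree partitions the feature space into $J$ disjoint regions $R_1, \dots, R_J$ and predicts a constant $c_j$ inside each one:
$$f(x) = \sum_{j=1}^{J} c_j \, \mathbb{1}[x \in R_j]$$
The goal is to find the regions and constants that minimize the total Sum of Squared Errors:
$$\text{SSE} = \sum_{j=1}^{J} \sum_{i:\, x_i \in R_j} (y_i - c_j)^2$$
For a fixed region $R_j$, setting the derivative with respect to $c_j$ to zero gives
$$\hat{c}_j = \frac{1}{|R_j|} \sum_{i:\, x_i \in R_j} y_i$$
So the optimal constant for a region is simply the mean of the targets that fall inside it.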
Finding the globally optimal partition is NP-hard, so instead a greedy
top-down algorithm is used. At each node, every possible split is
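considered: a split is defined by a feature index $k$ and a threshold $t$, which send observations with $x_{ik} \le t$ to the left child and those with $x_{ik} > t$ to the right child.
The algorithm selects the pair $(k, t)$ that minimizes the combined error of the two children, $\text{SSE}_L + \text{SSE}_R$.
This is equivalent to maximizing the variance reduction
$$\Delta = \text{SSE}_{\text{parent}} - \bigl(\text{SSE}_L + \text{SSE}_R\bigr)$$
Since $\text{SSE}_{\text{parent}}$ is fixed for a given node, minimizing the children's total SSE and maximizing $\Delta$ select the same split.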
The tree is stored as a flat std::vector<Node>, where parent nodes
reference their children by vector index. This avoids pointer-based tree
structures and keeps memory contiguous.
| Field | Type | Meaning |
| --- | --- | --- |
| `feature_idx` | `int` | Splitting feature |
| `threshold` | `double` | Split threshold |
| `prediction` | `double` | Leaf constant |
| `left_child` | `int` | Index of child where $x_k \le t$ |
| `right_child` | `int` | Index of child where $x_k > t$ |
// One node of a regression tree that is stored as a flat std::vector<Node>.
// Children are referenced by their index in that vector rather than by pointer.
// A node counts as a leaf for as long as feature_idx keeps its default of -1.
struct Node {
    int feature_idx = -1;     // column tested at this node; -1 marks a leaf
    double threshold = 0.0;   // split threshold compared against the feature value
    double prediction = 0.0;  // constant returned when traversal ends at this node
    int left_child = -1;      // vector index of the left child; -1 when there is none
    int right_child = -1;     // vector index of the right child; -1 when there is none

    // Creates a leaf that predicts `pred`.
    // explicit: a bare double must not turn into a Node by accident (for example
    // when passed where a Node is expected); callers write Node(value).
    explicit Node(double pred) : prediction(pred) {}

    // True while no splitting feature has been assigned to this node.
    bool is_leaf() const { return feature_idx == -1; }
};
// For each feature column
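The key identity is:
$$\text{SSE} = \sum_{i} (y_i - \bar{y})^2 = \sum_{i} y_i^2 - \frac{\left(\sum_{i} y_i\right)^2}{n}$$
This lets SSE be updated incrementally as observations shift from the
right child to the left, rather than being recomputed from scratch. The
full split search over all $p$ features therefore costs $O(p \, n \log n)$ per node, dominated by sorting each feature column.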
// Outcome of a split search.
// The defaults describe "no split found": feature_idx stays -1 and best_sse stays
// at +infinity, so any finite candidate error compares as smaller.
struct SplitResult {
    int feature_idx{-1};                                       // winning feature column, or -1
    double threshold{0.0};                                     // threshold chosen for that column
    double best_sse{std::numeric_limits<double>::infinity()};  // lowest total SSE seen so far
};
// Searches every feature column for the split of the given rows that gives the
// lowest total sum of squared errors (left SSE + right SSE).
//
// X       - feature matrix; entries of `indices` are used as its row numbers.
// y       - targets, indexed by the same row numbers.
// indices - rows that belong to the node being split.
//
// Returns the best split found. feature_idx stays -1 when fewer than two rows are
// given or when no column holds two distinct adjacent values. Ties keep the first
// candidate encountered because the comparison is strict.
//
// Per column the rows are sorted by feature value and swept once, moving one row
// at a time from the right side to the left; each side's SSE comes from the running
// sums via SSE = sum(y^2) - (sum y)^2 / n.
// NOTE(review): feature values are assumed not to be NaN, since NaN breaks the
// ordering std::sort relies on - confirm against the data source.
SplitResult find_best_split(const Matrix& X, const std::vector<double>& y, const std::vector<int>& indices) {
    SplitResult best_split;
    const size_t n = indices.size();
    if (n < 2) return best_split;  // a split needs at least one row on each side

    // (feature value, row index) pairs; allocated once and refilled for each column.
    std::vector<std::pair<double, int>> feature_vals(n);

    for (size_t k = 0; k < X.cols; ++k) {
        for (size_t i = 0; i < n; ++i) {
            feature_vals[i] = {X.at(indices[i], k), indices[i]};
        }
        std::sort(feature_vals.begin(), feature_vals.end());

        // Every row starts on the right side.
        double sum_left = 0.0;
        double sum_right = 0.0;
        double sum_sq_left = 0.0;
        double sum_sq_right = 0.0;
        for (size_t i = 0; i < n; ++i) {
            const double target = y[feature_vals[i].second];
            sum_right += target;
            sum_sq_right += target * target;
        }

        size_t n_left = 0;
        size_t n_right = n;
        for (size_t i = 0; i + 1 < n; ++i) {
            // Move row i from the right side to the left side.
            const double target = y[feature_vals[i].second];
            sum_left += target;
            sum_sq_left += target * target;
            n_left++;
            sum_right -= target;
            sum_sq_right -= target * target;
            n_right--;

            const double lo = feature_vals[i].first;
            const double hi = feature_vals[i + 1].first;
            // No threshold can separate equal values, so this is not a candidate.
            if (lo == hi) continue;

            const double sse_left = sum_sq_left - (sum_left * sum_left) / n_left;
            const double sse_right = sum_sq_right - (sum_right * sum_right) / n_right;
            const double total_sse = sse_left + sse_right;
            if (total_sse < best_split.best_sse) {
                // Halve before adding so the midpoint cannot overflow. If rounding
                // pushes it outside [lo, hi) - e.g. lo and hi are adjacent doubles -
                // use lo, so that `value <= threshold` reproduces exactly the
                // partition that was scored above.
                double mid = lo * 0.5 + hi * 0.5;
                if (!(mid >= lo && mid < hi)) mid = lo;

                best_split.best_sse = total_sse;
                best_split.feature_idx = static_cast<int>(k);
                best_split.threshold = mid;
            }
        }
    }
    return best_split;
}
// The threshold is set to the midpoint between adjacent sorted values, so new observations can be compared cleanly against it.
The tree is built recursively. At each recursive call, find_best_split
is invoked on the current node's index subset. If a valid split is
found, two child nodes are pushed onto the std::vector<Node> and the
function recurses on each.
Recursion stops under the following conditions:
- Maximum depth reached: the depth parameter equals `max_depth`.
- Insufficient observations: fewer than 2 samples remain at the node, making a split impossible.
- No valid split found: within every feature, all observations at the node share the same value (the split search returns `feature_idx == -1`).
When recursion stops, the active node retains its initial prediction
value (set at construction to the mean of its assigned targets), and is
treated as a leaf via is_leaf().
void build_tree_recursive(const Matrix& X, const std::vector<double>& y, const std::vector<int>& indices,
int depth, int max_depth, std::vector<Node>& tree, int node_idx) {
if (depth >= max_depth || indices.size() < 2) return;
SplitResult best = find_best_split(X, y, indices);
if (best.feature_idx == -1) return;
tree[node_idx].feature_idx = best.feature_idx;
tree[node_idx].threshold = best.threshold;
std::vector<int> left_idx, right_idx;
double left_sum = 0.0, right_sum = 0.0;
for (int idx : indices) {
if (X.at(idx, best.feature_idx) <= best.threshold) {
left_idx.push_back(idx);
left_sum += y[idx];
} else {
right_idx.push_back(idx);
right_sum += y[idx];
}
}
if (!left_idx.empty()) {
tree.push_back(Node(left_sum / left_idx.size()));
tree[node_idx].left_child = tree.size() - 1;
build_tree_recursive(X, y, left_idx, depth + 1, max_depth, tree, tree[node_idx].left_child);
}
if (!right_idx.empty()) {
tree.push_back(Node(right_sum / right_idx.size()));
tree[node_idx].right_child = tree.size() - 1;
build_tree_recursive(X, y, right_idx, depth + 1, max_depth, tree, tree[node_idx].right_child);
}
}
// Builds a regression tree for the rows listed in `indices` and returns it as a
// flat vector whose element 0 is the root.
//
// X, y      - feature matrix and targets; entries of `indices` index both.
// indices   - rows to train on.
// max_depth - depth limit forwarded to build_tree_recursive (the root has depth 0).
//
// The root starts as a leaf predicting the mean target of the given rows and is
// then grown by build_tree_recursive. With no rows at all the result is a single
// leaf predicting 0.0.
std::vector<Node> build_tree(const Matrix& X, const std::vector<double>& y, const std::vector<int>& indices, int max_depth) {
    double sum = 0.0;
    for (int idx : indices) sum += y[idx];

    // An empty row set has no mean: 0.0 / 0 would make the root prediction NaN,
    // and that NaN would propagate into every sum this tree contributes to.
    const double root_mean = indices.empty() ? 0.0 : sum / indices.size();

    std::vector<Node> tree;
    tree.push_back(Node(root_mean));  // Root node
    build_tree_recursive(X, y, indices, 0, max_depth, tree, 0);
    return tree;
}
// To predict for a single observation
double predict_single_tree(const std::vector<Node>& tree, const std::vector<double>& x_i) {
int curr = 0;
while (!tree[curr].is_leaf()) {
if (x_i[tree[curr].feature_idx] <= tree[curr].threshold) {
curr = tree[curr].left_child;
} else {
curr = tree[curr].right_child;
}
}
return tree[curr].prediction;
}A decision tree grown to sufficient depth can perfectly fit any training dataset. This is actually a problem: a single observation changing can alter the root split, cascading into a structurally different tree. The model has high variance.
Ensemble methods address this. Gradient boosting takes a sequential additive approach: instead of growing one deep tree, it combines many shallow, constrained trees (weak learners), each of which corrects the errors of its predecessors.
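In parametric models (e.g. linear regression, neural networks), there is
a finite weight vector $\theta$, and training updates it by gradient descent on the loss $L(\theta)$:
$$\theta \leftarrow \theta - \eta \, \nabla_\theta L(\theta)$$
The gradient points in the direction of steepest ascent of $L$, so stepping against it, scaled by the learning rate $\eta$, reduces the loss.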
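Gradient boosting has no fixed weight vector. Instead, the object being
updated is the prediction function $F$ itself, evaluated at the training points: the vector $\bigl(F(x_1), \dots, F(x_N)\bigr)$ plays the role of the parameters.
The gradient of the empirical loss with respect to these predictions is computed pointwise:
$$g_i = \frac{\partial L\bigl(y_i, F(x_i)\bigr)}{\partial F(x_i)}$$
For $L(y, F) = \frac{1}{2}(y - F)^2$ this gives $g_i = F(x_i) - y_i$.
So the negative gradient is just the raw residual:
$$-g_i = y_i - F(x_i)$$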
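A gradient descent step in function space then looks like:
$$F_m(x_i) = F_{m-1}(x_i) - \eta \, g_i$$
The negative gradients $-g_i$ are only defined at the training points, so a regression tree is fitted to them; the tree extends the update direction to inputs that were not in the training set.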
The ensemble built by repeating this process is gradient descent in an infinite-dimensional function space, with trees serving as the update direction at each step.
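The algorithm begins with the constant prediction that minimizes the
total loss. For squared-error loss this is the mean of the targets:
$$F_0(x) = \arg\min_{c} \sum_{i=1}^{N} L(y_i, c) = \frac{1}{N} \sum_{i=1}^{N} y_i$$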
Each iteration executes the following steps.
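Step 1 — Compute pseudo-residuals. For each $i = 1, \dots, N$:
$$r_{im} = y_i - F_{m-1}(x_i)$$
Step 2 — Fit a weak learner. Train a regression tree $h_m$ on the pairs $(x_i, r_{im})$; its leaves define regions $R_{jm}$.
Step 3 — Compute optimal leaf values. For each leaf region $R_{jm}$:
$$\gamma_{jm} = \arg\min_{\gamma} \sum_{x_i \in R_{jm}} L\bigl(y_i, F_{m-1}(x_i) + \gamma\bigr)$$
For squared-error loss, this is the mean of the pseudo-residuals that fall in the leaf.
Step 4 — Update the ensemble. Shrink the new tree by learning rate $\eta$:
$$F_m(x) = F_{m-1}(x) + \eta \, h_m(x)$$
Choosing $\eta < 1$ makes each tree contribute only a fraction of its correction, which typically requires more trees but reduces overfitting.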
The GradientBoostingRegressor stores the initial prediction, the
learning rate, and the full ensemble of trees:
struct GradientBoostingRegressor {
int n_estimators;
double learning_rate;
int max_depth;
double initial_prediction;
std::vector<std::vector<Node>> ensemble;
GradientBoostingRegressor(int estimators, double lr, int depth)
: n_estimators(estimators), learning_rate(lr),
max_depth(depth), initial_prediction(0.0) {}
void fit(const Matrix& X, const std::vector<double>& Y);
double predict(const std::vector<double>& x_i) const;
double mse(const Matrix& X, const std::vector<double>& Y) const;
};void GradientBoostingRegressor::fit(const Matrix& X, const std::vector<double>& Y) {
size_t N = Y.size();
double sum = 0.0;
for (double y : Y) sum += y;
initial_prediction = sum / N;
std::vector<double> F_m(N, initial_prediction);
std::vector<double> pseudo_residuals(N, 0.0);
std::vector<int> all_indices(N);
std::iota(all_indices.begin(), all_indices.end(), 0);
for (int m = 0; m < n_estimators; ++m) {
for (size_t i = 0; i < N; ++i) {
pseudo_residuals[i] = Y[i] - F_m[i];
}
std::vector<Node> tree = build_tree(X, pseudo_residuals, all_indices, max_depth);
ensemble.push_back(tree);
for (size_t i = 0; i < N; ++i) {
std::vector<double> x_i(X.cols);
for(size_t j = 0; j < X.cols; ++j) {
x_i[j] = X.at(i, j);
}
double tree_pred = predict_single_tree(tree, x_i);
F_m[i] += learning_rate * tree_pred;
}
}
}Because build_tree stores the arithmetic mean of its target values in
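each leaf, and the optimal leaf value $\gamma_{jm}$ under squared-error loss is exactly that mean, Step 3 needs no separate computation: the leaf values produced by the tree are already optimal.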
double GradientBoostingRegressor::predict(const std::vector<double>& x_i) const {
double final_prediction = initial_prediction;
for (size_t m = 0; m < ensemble.size(); ++m) {
final_prediction += learning_rate * predict_single_tree(ensemble[m], x_i);
}
return final_prediction;
}The final prediction reconstructs