diff --git a/Project.toml b/Project.toml index ce13729bb..44c0dda63 100644 --- a/Project.toml +++ b/Project.toml @@ -26,14 +26,19 @@ MathOptSetDistances = "0.1" julia = "1" [extras] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" GLPK = "60bf3e95-4087-53dc-ae20-288a0d20c6a6" Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9" -JuMP = "4076af6c-e467-56ae-b986-b466b2749572" OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "JuMP"] +test = ["Test", "Clp", "DelimitedFiles", "SCS", "OSQP", "GLPK", "Ipopt", "CSV", "Statistics", "DataFrames", "Flux", "Printf", "HTTP"] diff --git a/README.md b/README.md index 345b89013..ed805bbb6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # DiffOpt.jl -[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://jump.dev/DiffOpt.jl/stable) + [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://jump.dev/DiffOpt.jl/dev) [![Build Status](https://github.com/jump-dev/DiffOpt.jl/workflows/CI/badge.svg?branch=master)](https://github.com/jump-dev/DiffOpt.jl/actions?query=workflow%3ACI) [![Coverage](https://codecov.io/gh/jump-dev/DiffOpt.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/jump-dev/DiffOpt.jl) @@ -65,7 +65,8 @@ then one can compute gradients by providing perturbations grads = backward(diff, dA, db, dc) ``` + diff --git a/docs/Project.toml b/docs/Project.toml index 5a2d097f4..bbd3a0b75 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,7 +2,16 @@ ChainRulesCore = 
"d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Clp = "e2554f3b-3117-50c0-817c-e040a3ddf72d" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" +MatrixOptInterface = "2f4eb8e6-3e35-4ae4-8c7a-f3d7d9bf20ed" +OSQP = "ab2f91bb-94b4-55e3-9ba0-7f65df51de79" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SCS = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +Dualization = "191a621a-6537-11e9-281d-650236a99e60" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" \ No newline at end of file diff --git a/docs/make.jl b/docs/make.jl index c8a5668d4..ca565d9af 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,13 +15,19 @@ makedocs(; "Usage" => "usage.md", "Reference" => "reference.md", "Examples" => [ - "Solving an LP" => "solve-LP.md", - "Solving a QP" => "solve-QP.md", "Solving conic with PSD and SOC constraints" => "solve-conic-1.md", "Differentiating a simple QP by hand" => "matrix-inversion-manual.md", - "Sensitivity Analysis SVM" => "sensitivity-analysis-svm.md", + "Sensitivity Analysis" => [ + "SVM" => "sensitivity-analysis-svm.md", + "Ridge Regression" => "sensitivity-analysis-ridge.md", + ], + "Hyper-parameter optimization" => "autotuning-ridge.md", + "Custom Neural Network Layers" => [ + "ReLU Layer" => "custom-relu.md", + "SVM as a Layer" => "custom-svm.md", + ], "ChainRules integration" => "chainrules_unit.md", - ], + ] ], strict = true, # See https://github.com/JuliaOpt/JuMP.jl/issues/1576 repo="https://github.com/jump-dev/DiffOpt.jl", diff --git a/docs/src/autotuning-ridge.md b/docs/src/autotuning-ridge.md new file mode 100644 index 000000000..afac9aebd --- /dev/null +++ b/docs/src/autotuning-ridge.md @@ -0,0 +1,226 @@ +# 
Auto-tuning Hyperparameters + +This example shows how to learn the hyperparameters in Ridge Regression using a gradient descent routine. Let the problem be modelled as + +```math +\begin{equation} +\min_{w} \quad \frac{1}{2n} \sum_{i=1}^{n} (y_{i} - w^T x_{i})^2 + \alpha \| w \|_2^2 +\end{equation} +``` + +where +- `x`, `y` are the data points +- `w` constitutes weights of the regressing line +- `α` is the only hyperparameter - regularization constant + + + +```@example 3 +using DiffOpt +using Statistics +using OSQP +using JuMP +using Plots +import Random +using LinearAlgebra +nothing # hide +``` + + +```@example 3 +""" +Return the coefficient of determination R2 of the prediction. +Best possible score is 1.0, it can be negative because the model can be arbitrarily worse +""" +function R2(y_true, y_pred) + u = sum((y_pred - y_true).^2) # Regression sum of squares + v = sum((y_true .- mean(y_true)).^2) # Total sum of squares + + return 1-(u/v) +end + +function create_problem(N, D, noise) + w = rand(D) + X = rand(N, D) + + # if noise=0, then there is no need of regularization + # and alpha=0 wi ll give the best R2 pred score + Y = X*w .+ noise*randn(N) + + # test train split + l = N ÷ 2 + return X[1:l, :], X[l+1:N, :], Y[1:l], Y[l+1:N] +end + +X_train, X_test, Y_train, Y_test = create_problem(800, 30, 4); +``` + + +```@example 3 +function fitRidge(X,Y,α) + model = Model(() -> diff_optimizer(OSQP.Optimizer)) + + N, D = size(X) + + # add variables + @variable(model, w[1:D]>=-10) + set_optimizer_attribute(model, MOI.Silent(), true) + + @objective( + model, + Min, + sum((Y - X*w).*(Y - X*w))/(2.0*N) + α*(w'w), + ) + + optimize!(model) + + custom_loss = objective_value(model) + return model, w, custom_loss, value.(w) +end +nothing # hide +``` + + + +```@example 3 +αs = [0.0, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 7e-2, 2e-1, 3e-1, .5, .7, 1.0] +Rs = Float64[] +mse = Float64[] + +for α in αs + _, _, _, w_train = fitRidge(X_train, Y_train, α) + Y_pred = X_test*w_train + 
push!(Rs, R2(Y_test, Y_pred)) + push!(mse, sum((Y_pred - Y_test).^2)) +end +nothing # hide +``` + + +```@example 3 +plot(log.(αs), Rs*10, label="R2 prediction score", xaxis = ("log(α)")) +``` + + + + +```@example 3 +plot(log.(αs), mse, label="MSE", xaxis = ("log(α)")) +``` + + + + +# Plotting ∂l/∂α + + +```@example 3 +function ∇model(model, X_train, w, ŵ, α) + N, D = size(X_train) + dw = zeros(D) + ∂w_∂α = zeros(D) + + for i in 1:D + dw[i] = 1.0 #set + + MOI.set( + model, + DiffOpt.ForwardInObjective(), + MOI.ScalarQuadraticFunction( + [MOI.ScalarAffineTerm(0.0, w[i].index)], + [MOI.ScalarQuadraticTerm(dw[i]*α, w[i].index, w[i].index)], + 0.0 + ) + ) + + DiffOpt.forward(model) # find grad + + ∂w_∂α[i] = MOI.get( + model, + DiffOpt.ForwardOutVariablePrimal(), + w[i] + ) + + dw[i] = 0.0 #unset + end + return sqrt(ŵ'ŵ) + 2α*(ŵ'∂w_∂α) - sum((X_train*∂w_∂α).*(Y_train - X_train*ŵ))/(2*N) +end +nothing # hide +``` + + + +```@example 3 +∂l_∂αs = Float64[] +N, D = size(X_train) + +for α in αs + model, w, _, ŵ = fitRidge(X_train, Y_train, α) + + # testing optimality + ∂l_∂w = [2*α*ŵ[i] - sum(X_train[:,i].*(Y_train - X_train*ŵ))/N for i in 1:D] + @assert norm(∂l_∂w) < 1e-1 + + push!( + ∂l_∂αs, + ∇model(model, X_train, w, ŵ, α) + ) +end + +plot(αs, ∂l_∂αs, label="∂l/∂α", xaxis = ("α")) +``` + + + + +# Gradient Descent + +```@example 3 +""" + start from initial value of regularization constant + do gradient descent on alpha + until the MSE keeps on decreasing +""" +function descent(α, max_iters=25) + prev_mse = 1e7 + curr_mse = 1e6 + + α_s = Float64[] + mse = Float64[] + + iter=0 + while curr_mse-10 < prev_mse && iter < max_iters + iter += 1 + model, w, _, ŵ = fitRidge(X_train, Y_train, α) + + #update + ∂α = ∇model(model, X_train, w, ŵ, α) + + α += 0.01*∂α # make it grow real slow + + push!(α_s, α) + + Y_pred = X_test*ŵ + + prev_mse = curr_mse + curr_mse = sum((Y_pred - Y_test).^2) + + push!(mse, curr_mse) + end + + return α_s, mse +end +nothing # hide +``` + + +```@example 3 
+ᾱ, msē = descent(1.0); +nothing # hide +``` + + +```@example 3 +plot(log.(αs), mse, label="MSE", xaxis = ("α")) +plot!(log.(ᾱ), msē, label="G.D. for α", lw = 2) +``` diff --git a/docs/src/custom-relu.md b/docs/src/custom-relu.md new file mode 100644 index 000000000..5749247fe --- /dev/null +++ b/docs/src/custom-relu.md @@ -0,0 +1,136 @@ +# Custom ReLU layer + +We demonstrate how DiffOpt can be used to generate a simple neural network unit - the ReLU layer. A nueral network is created using Flux.jl which is trained on the MNIST dataset. + + +```@example 1 +using Statistics +using DiffOpt +using Flux +using Flux: onehotbatch, onecold, crossentropy, throttle +using Base.Iterators: repeated +using OSQP +using JuMP +using ChainRulesCore +``` + + +```@example 1 +## prepare data +imgs = Flux.Data.MNIST.images() +labels = Flux.Data.MNIST.labels(); + +# Preprocessing +X = hcat(float.(reshape.(imgs, :))...) #stack all the images +Y = onehotbatch(labels, 0:9); # just a common way to encode categorical variables + +test_X = hcat(float.(reshape.(Flux.Data.MNIST.images(:test), :))...) 
+test_Y = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9) + +# float64 to float16, to save memory +X = convert(Array{Float16,2}, X) +test_X = convert(Array{Float16,2}, test_X) + +X = X[:, 1:1000] +Y = Y[:, 1:1000] +nothing # hide +``` + + +```@example 1 +""" +relu method for a Matrix +""" +function matrix_relu(y::AbstractMatrix{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T} + _x = zeros(T, size(y)) + + # model init + N = length(y[:, 1]) + empty!(model) + set_optimizer_attribute(model, MOI.Silent(), true) + @variable(model, x[1:N] >= zero(T)) + + for i in 1:size(y)[2] + @objective( + model, + Min, + x'x -2x'y[:, i] + ) + optimize!(model) + _x[:, i] = value.(x) + end + return _x +end +nothing # hide +``` + + + + +```@example 1 +function ChainRulesCore.rrule(::typeof(matrix_relu), y::AbstractArray{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T} + + pv = matrix_relu(y, model=model) + + function pullback_matrix_relu(dx) + x = model[:x] + dy = zeros(T, size(dx)) + + for i in 1:size(y)[2] + MOI.set.( + model, + DiffOpt.BackwardInVariablePrimal(), + x, + dx[:, i] + ) + + DiffOpt.backward(model) # find grad + + # fetch the objective expression + obj_exp = MOI.get(model, DiffOpt.BackwardOutObjective()) + + dy[:, i] = JuMP.coefficient.(obj_exp, x) # coeff of `x` in -2x'y + dy[:, i] = -2 * dy[:, i] + end + + return (NO_FIELDS, dy) + end + return pv, pullback_matrix_relu +end +``` + +## Define the Network + + +```@example 1 +m = Chain( + Dense(784, 64), + matrix_relu, + Dense(64, 10), + softmax, +) +nothing # hide +``` + + +```@example 1 +custom_loss(x, y) = crossentropy(m(x), y) +opt = ADAM(); # popular stochastic gradient descent variant + +accuracy(x, y) = mean(onecold(m(x)) .== onecold(y)) # cute way to find average of correct guesses + +dataset = repeated((X,Y), 20) # repeat the data set, very low accuracy on the orig dataset +evalcb = () -> @show(custom_loss(X, Y)) # callback to show loss +nothing # hide +``` + + +Although our custom 
implementation takes time, it is able to reach similar accuracy as the usual ReLU function implementation. + +```@example 1 +Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); + +@show accuracy(X,Y) +@show accuracy(test_X, test_Y) +nothing # hide +``` diff --git a/docs/src/custom-svm.md b/docs/src/custom-svm.md new file mode 100644 index 000000000..8b693a6e4 --- /dev/null +++ b/docs/src/custom-svm.md @@ -0,0 +1,147 @@ +# Custom SVM layer + +This example demonstrates implementing an SVM using DiffOpt. This SVM is used as a layer in a neural network trained to predict survivors on the [Kaggle Titanic Dataset](https://www.kaggle.com/c/titanic). + +```@example 1 +using Statistics +using DiffOpt +using Flux +using Flux: onecold, binarycrossentropy, throttle, logitcrossentropy, crossentropy +using Base.Iterators: repeated +using JuMP +using SCS +using CSV +using DataFrames +using ChainRulesCore +using HTTP + +const DATA_URL = "https://raw.githubusercontent.com/be-apt/jump-gsoc-2020/master/titanic_preprocessed.csv" +nothing # hide +``` + + +```@example 1 +labels = NaN; # hack for the SVM +nothing # hide +``` + +```@example 1 +""" + SVM as a Flux layer +""" +function SVM(X::AbstractMatrix{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T} + D, N = size(X) + + Y = vec([y >= 0.5 ? 1 : -1 for y in labels]') + # scale class from 0,1 to -1,1. 
required for SVM + + # model init + empty!(model) + set_optimizer_attribute(model, MOI.Silent(), true) + + # add variables + @variable(model, l[1:N]) + @variable(model, w[1:D]) + @variable(model, b) + + @constraint(model, cons, Y.*(X'*w .+ b) + l.-1 ∈ MOI.Nonnegatives(N)) + @constraint(model, 1.0*l ∈ MOI.Nonnegatives(N)); + + @objective( + model, + Min, + sum(l), + ) + + optimize!(model) + + wv = value.(w) + bv = value(b) + + return (X'*wv .+ bv)' #prediction +end +nothing # hide +``` + + + +```@example 1 +function ChainRulesCore.rrule(::typeof(SVM), X::AbstractArray{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T} + + predictions = SVM(X, model=model) + + """ + model[:w], model[:b] are the weights of this layer + they are not updated using backward pass + since they can be computed to an accurate degree using a solver + """ + function pullback_SVM(dX) + dy = zeros(T, size(dX)) # since w# + return (NO_FIELDS, dy) + end + return predictions, pullback_SVM +end +``` + + +```@example 1 +function fetchProblem(;split_ratio::Float64) + df = CSV.File(HTTP.get(DATA_URL).body) |> DataFrame + + Y = df[:, 2] + X = df[!, 3:12] + X = Matrix(X)' + + D, N = size(X) + + l = Int(floor(length(Y)*split_ratio)) + return X[:, 1:l], X[:, l+1:N], Y[1:l]', Y[l+1:N]' +end +X_train, X_test, Y_train, Y_test = fetchProblem(split_ratio=0.8) +D = size(X_train)[1] +nothing # hide +``` + +## Define the NN + + +```@example 1 +m = Chain( + Dense(D, 16, relu), + Dropout(0.5), + SVM +) +nothing # hide +``` + + +```@example 1 +custom_loss(x, y) = logitcrossentropy(m(x), y) +opt = ADAM(); # popular stochastic gradient descent variant + +classify(x::Float64) = (x>=0.5) ? 
1 : 0 + +function accuracy(x, y_true) + y_pred = classify.(m(x)) + return sum(y_true .≈ y_pred) / length(y_true) +end + +dataset = repeated((X_train,Y_train), 1) # repeat the data set, very low accuracy on the orig dataset +evalcb = () -> @show(custom_loss(X_train,Y_train)) # callback to show loss +nothing # hide +``` + + + +```@example 1 +labels = Y_train # needed for SVM +for iter in 1:1 + Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU +end + +@show accuracy(X_train, Y_train) + +labels = Y_test # needed for SVM +@show accuracy(X_test, Y_test) +nothing # hide +``` diff --git a/docs/src/matrix-inversion-manual.md b/docs/src/matrix-inversion-manual.md index 3e86a1d3e..33f220950 100644 --- a/docs/src/matrix-inversion-manual.md +++ b/docs/src/matrix-inversion-manual.md @@ -120,7 +120,7 @@ gradh = tape.gradient(summed_solution, [h_tf]) ## Finding Jacobian using MOI, Dualization.jl, LinearAlgebra.jl -```julia +```@example 1 using Random using MathOptInterface using Dualization @@ -193,11 +193,5 @@ end LHS = [4 1 1; 1 2 1; 1 1 0] # of Eqn (6) RHS = [0; 0; 1] # of Eqn (6) -pp \ qq # the jacobian +LHS \ RHS # the required jacobian ``` - - - 3-element Array{Float64,1}: - 0.25 - 0.75 - -1.75 diff --git a/docs/src/sensitivity-analysis-ridge.md b/docs/src/sensitivity-analysis-ridge.md new file mode 100644 index 000000000..c3f68612f --- /dev/null +++ b/docs/src/sensitivity-analysis-ridge.md @@ -0,0 +1,122 @@ +# Sensitivity Analysis of Ridge Regression using DiffOpt.jl + +This example illustrates the sensitivity analysis of data points in a [Ridge Regression](https://en.wikipedia.org/wiki/Ridge_regression) problem. 
The general form of the problem is given below: + +```math +\begin{split} +\begin{array} {ll} +\mbox{minimize} & \sum_{i=1}^{N} (y_{i} - w x_{i} - b)^2 + \alpha (w^2 + b^2) \\ +\end{array} +\end{split} +``` +where +- `w`, `b` are slope and intercept of the regressing line +- `x`, `y` are the N data points +- `α` is the regularization constant + +## Define and solve the problem +```@example 2 +import Random +import OSQP +import Plots +using DiffOpt +using JuMP +using LinearAlgebra +``` + +Construct a set of noisy (guassian) data points around a line. +```@example 2 +function create_problem(N=100) + m = 2*abs(randn()) + b = rand() + X = randn(N) + Y = m*X .+ b + 0.8*randn(N) + + return X, Y +end + +X, Y = create_problem(); +nothing # hide +``` + +The helper method `fitRidge` defines and solves the corresponding model. +```@example 2 +function fitRidge(X,Y,alpha=0.1) + model = Model(() -> diff_optimizer(OSQP.Optimizer)) + + # add variables + @variable(model, w) + @variable(model, b) + set_optimizer_attribute(model, MOI.Silent(), true) + + @objective( + model, + Min, + sum((Y - w*X .- b).*(Y - w*X .- b)) + alpha*(sum(w*w)+sum(b*b)), + ) + + optimize!(model) + + loss = objective_value(model) + return model, w, b, loss, value(w), value(b) +end +nothing # hide +``` + +Train on the data generated. +```@example 2 +model, w, b, loss_train, ŵ, b̂ = fitRidge(X, Y) +nothing # hide +``` + +We can visualize the approximating line. +```@example 2 +p = Plots.scatter(X, Y, label="") +mi, ma = minimum(X), maximum(X) +Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="") +nothing # hide +``` + + +## Differentiate +Now that we've solved the problem, we can compute the sensitivity of optimal values -- the approximating line components `w`, `b` in this case -- with respect to perturbations in the data points (`x`,`y`). 
+```@example 2 +∇ = zero(X) + +for i in 1:length(X) + MOI.set( + model, + DiffOpt.ForwardInObjective(), + MOI.ScalarQuadraticFunction( + [MOI.ScalarAffineTerm(-2(Y[1] + X[1]), w.index)], + [MOI.ScalarQuadraticTerm(2X[1], w.index, w.index)], + 0.0 + ) + ) + + DiffOpt.forward(model) + + db = MOI.get( + model, + DiffOpt.ForwardOutVariablePrimal(), + b + ) + + ∇[i] = db +end +normalize!(∇); +nothing # hide +``` + +Visualize point sensitivities with respect to regressing line. Note that the gradients are normalized. +```@example 2 +p = Plots.scatter( + X, Y, + color=[x>0 ? :red : :blue for x in ∇], + markersize=[25*abs(x) for x in ∇], + label="" +) +mi, ma = minimum(X), maximum(X) +Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="") +nothing # hide +``` diff --git a/docs/src/sensitivity-analysis-svm-img-1.svg b/docs/src/sensitivity-analysis-svm-img-1.svg deleted file mode 100644 index b21f60722..000000000 --- a/docs/src/sensitivity-analysis-svm-img-1.svg +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/src/sensitivity-analysis-svm-img-2.svg b/docs/src/sensitivity-analysis-svm-img-2.svg deleted file mode 100644 index d20435b76..000000000 --- a/docs/src/sensitivity-analysis-svm-img-2.svg +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/src/sensitivity-analysis-svm-img-3.svg b/docs/src/sensitivity-analysis-svm-img-3.svg deleted file mode 100644 index 
92efcf853..000000000 --- a/docs/src/sensitivity-analysis-svm-img-3.svg +++ /dev/null @@ -1,214 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/src/sensitivity-analysis-svm.md b/docs/src/sensitivity-analysis-svm.md index 20a4d489c..500408ba6 100644 --- a/docs/src/sensitivity-analysis-svm.md +++ b/docs/src/sensitivity-analysis-svm.md @@ -23,7 +23,6 @@ Import the libraries. ```@example 1 import Random -using Test import SCS import Plots using DiffOpt @@ -37,9 +36,9 @@ Construct separatable, non-trivial data points. ```@example 1 N = 100 D = 2 -Random.seed!(6) -X = vcat(randn(N, D), randn(N,D) .+ [4.0,1.5]') -y = append!(ones(N), -ones(N)) +Random.seed!(62) +X = vcat(randn(N÷2, D), randn(N÷2,D) .+ [4.0,1.5]') +y = append!(ones(N÷2), -ones(N÷2)) nothing # hide ``` @@ -111,11 +110,8 @@ svm_y = (-bv .- wv[1] * svm_x )/wv[2] p = Plots.scatter(X[:,1], X[:,2], color = [yi > 0 ? :red : :blue for yi in y], label = "") Plots.yaxis!(p, (-2, 4.5)) Plots.plot!(p, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3) -Plots.savefig("svm_separating.svg") -nothing # hide ``` -![svg](svm_separating.svg) # Experiments Now that we've solved the SVM, we can compute the sensitivity of optimal values -- the separating hyperplane in our case -- with respect to perturbations of the problem data -- the data points -- using DiffOpt. 
For illustration, we've explored two questions: @@ -206,17 +202,13 @@ p2 = Plots.scatter( ) Plots.yaxis!(p2, (-2, 4.5)) Plots.plot!(p2, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3) -Plots.savefig("sensitivity2.svg") -nothing # hide ``` -![](sensitivity2.svg) - ## Experiment 2: Gradient of hyperplane wrt the data point coordinates Similar to previous example, construct perturbations in data points coordinates `X`. -```julia +```@example 1 ∇ = Float64[] dX = zeros(N, D) @@ -229,7 +221,7 @@ for Xi in 1:N model, DiffOpt.ForwardInConstraint(), cons, - MOI.Utilities.vectorize(dX[:,i] .* w[i]), + MOI.Utilities.vectorize(dX[:,i] .* MOI.SingleVariable(w[i])), ) end @@ -261,8 +253,4 @@ p3 = Plots.scatter( ) Plots.yaxis!(p3, (-2, 4.5)) Plots.plot!(p3, svm_x, svm_y, label = "loss = $(round(loss, digits=2))", width=3) -Plots.savefig(p3, "sensitivity3.svg") -nothing # hide ``` - -![](sensitivity3.svg) diff --git a/docs/src/solve-LP.md b/docs/src/solve-LP.md deleted file mode 100644 index 6f5a44db1..000000000 --- a/docs/src/solve-LP.md +++ /dev/null @@ -1,158 +0,0 @@ -# Solving LP primal - - -```julia -using Random -using GLPK -using MathOptInterface -using Dualization - -const MOI = MathOptInterface -const MOIU = MathOptInterface.Utilities; -``` - - -```julia -D = 10 # variable dimension -N = 20; # no of inequality constraints -``` - -## create a non-trivial LP problem -$$\text{min } c^Tx$$ -$$\text{s.t. 
} Ax \leq b$$ - $$x \geq 0, x \in R^D$$ - - -```julia -s = rand(N) -s = 2*s.-1 -λ = max.(-s, 0) -s = max.(s, 0) -x̂ = rand(D) -A = rand(N, D) -b = A*x̂ .+ s -c = -A'*λ; -``` - - -```julia -# can feed dual problem to optimizer like this: -# model = MOI.instantiate(dual_optimizer(GLPK.Optimizer), with_bridge_type=Float64) - -model = GLPK.Optimizer() -x = MOI.add_variables(model, D) - -# define objective -objective_function = MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(c, x), 0.0) -MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(), objective_function) -MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE) -``` - - -```julia -# will be useful later -constraint_indices = [] - -# set constraints -for i in 1:N - push!(constraint_indices, MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.LessThan(b[i]))) -end - -for i in 1:D - push!(constraint_indices, MOI.add_constraint(model,MOI.SingleVariable(x[i]),MOI.GreaterThan(0.))) -end -``` - - -```julia -MOI.optimize!(model) -``` - - -```julia -@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL] -``` - - -```julia -x̄ = MOI.get(model, MOI.VariablePrimal(), x); # solution -``` - - -```julia -@assert abs(c'x̄ - c'x̂) <= 1e-8 # sanity check -``` - -## find and solve dual problem - -| primal | dual | -|--------|------| -$$\text{min } c^Tx$$ | $$\text{max } b^Ty$$ -$$\text{s.t. } Ax \leq b$$ | $$\text{s.t. 
} A^Ty \geq c$$ - $$x \geq 0$$ | $$y \leq 0$$ - -- Each primal variable becomes a dual constraint -- Each primal constraint becomes a dual variable - - -```julia -joint_object = dualize(model) -dual_model_like = joint_object.dual_model # this is MOI.ModelLike, not an MOI.AbstractOptimizer; can't call optimizer on it -primal_dual_map = joint_object.primal_dual_map; -``` - - -```julia -# copy the dual model objective, constraints, and variables to an optimizer -dual_model = GLPK.Optimizer() -MOI.copy_to(dual_model, dual_model_like) - -# solve dual -MOI.optimize!(dual_model); -``` - - -```julia -# NOTE: You can obtain components of the dual model individually by - -# dual_objective = dual_model_like.objective # b'y -# dual_variable_indices = [primal_dual_map.primal_con_dual_var[x][1] for x in constraint_indices] -# dual_constraint_indices = [primal_dual_map.primal_var_dual_con[i] for i in x]; - -# ŷ = MOI.get(dm, MOI.VariablePrimal(), dual_variable_indices) -``` - - -```julia -# check if strong duality holds -@assert abs(MOI.get(model, MOI.ObjectiveValue()) - MOI.get(dual_model, MOI.ObjectiveValue())) <= 1e-8 -``` - -## derive and verify KKT conditions - -**complimentary slackness**: $$\mu_{i}(A\bar x -b)_i=0,\quad \mu_{j+N} \bar x_j =0 \qquad \text{ where } i=1..N, j = 1..D$$ - - -```julia -is_less_than(set::S) where {S<:MOI.AbstractSet} = false -is_less_than(set::MOI.LessThan{T}) where T = true - -map = primal_dual_map.primal_con_dual_var -for con_index in keys(map) - con_value = MOI.get(model, MOI.ConstraintPrimal(), con_index) - set = MOI.get(model, MOI.ConstraintSet(), con_index) - μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1]) - - if is_less_than(set) - # μ[i]*(Ax - b)[i] = 0 - @assert μ*(con_value - set.upper) < 1e-10 - else - # μ[j]*x[j] = 0 - @assert μ*(con_value - set.lower) < 1e-10 - end -end -``` - - -```julia - -``` diff --git a/docs/src/solve-QP.md b/docs/src/solve-QP.md deleted file mode 100644 index 7ad7a8808..000000000 --- 
a/docs/src/solve-QP.md +++ /dev/null @@ -1,196 +0,0 @@ -# Solving QP primal - - -```julia -using Random -using MathOptInterface -using Dualization -using OSQP - -const MOI = MathOptInterface -const MOIU = MathOptInterface.Utilities; -``` - - -```julia -n = 20 # variable dimension -m = 15 # no of inequality constraints -p = 15; # no of equality constraints -``` - -## create a non-trivial QP problem -$$\text{min } \frac{1}{2}x^TQx + q^Tx$$ -$$\text{s.t. }Gx <= h$$ - $$Ax = b$$ - - -```julia -x̂ = rand(n) -Q = rand(n, n) -Q = Q'*Q # ensure PSD -q = rand(n) -G = rand(m, n) -h = G*x̂ + rand(m) -A = rand(p, n) -b = A*x̂; -``` - - -```julia -model = MOI.instantiate(OSQP.Optimizer, with_bridge_type=Float64) -x = MOI.add_variables(model, n); -``` - - -```julia -# define objective - -quad_terms = MOI.ScalarQuadraticTerm{Float64}[] -for i in 1:n - for j in i:n # indexes (i,j), (j,i) will be mirrored. specify only one kind - push!(quad_terms, MOI.ScalarQuadraticTerm(Q[i,j],x[i],x[j])) - end -end - -objective_function = MOI.ScalarQuadraticFunction(MOI.ScalarAffineTerm.(q, x),quad_terms,0.0) -MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarQuadraticFunction{Float64}}(), objective_function) -MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE) -``` - - -```julia -# maintain constrain to index map - will be useful later -constraint_map = Dict() - -# add constraints -for i in 1:m - ci = MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(G[i,:], x), 0.),MOI.LessThan(h[i])) - constraint_map[ci] = i -end - -for i in 1:p - ci = MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.EqualTo(b[i])) - constraint_map[ci] = i -end -``` - - -```julia -MOI.optimize!(model) -``` - - -```julia -@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL] -``` - - -```julia -x̄ = MOI.get(model, MOI.VariablePrimal(), x); -``` - - -```julia -# objective value (predicted vs actual) sanity check -@assert 0.5*x̄'*Q*x̄ + 
q'*x̄ <= 0.5*x̂'*Q*x̂ + q'*x̂ -``` - -## find and solve dual problem - -| primal | dual | -|--------|------| -$$\text{min } \frac{1}{2}x^TQx + q^Tx$$ | $$\text{max } -\frac{1}{2}y^TQ^{-1}y - u^Th - v^Tb$$ -$$\text{s.t. }Gx <= h$$ | $$\text{s.t. } u \geq 0, u \in R^m, v \in R^n$$ - $$Ax = b$$ | $$y = q + G^Tu + A^Tv$$ - -- Each primal variable becomes a dual constraint -- Each primal constraint becomes a dual variable - - -```julia -# NOTE: can't use Ipopt -# Ipopt.Optimizer doesn't supports accessing MOI.ObjectiveFunctionType - -joint_object = dualize(model) -dual_model_like = joint_object.dual_model # this is MOI.ModelLike, not an MOI.AbstractOptimizer; can't call optimizer on it -primal_dual_map = joint_object.primal_dual_map; -``` - - -```julia -# copy the dual model objective, constraints, and variables to an optimizer -dual_model = MOI.instantiate(OSQP.Optimizer, with_bridge_type=Float64) -MOI.copy_to(dual_model, dual_model_like) - -# solve dual -MOI.optimize!(dual_model); -``` - - -```julia -# check if strong duality holds -@assert abs(MOI.get(model, MOI.ObjectiveValue()) - MOI.get(dual_model, MOI.ObjectiveValue())) <= 1e-1 -``` - -## derive and verify KKT conditions - - -```julia -is_equality(set::S) where {S<:MOI.AbstractSet} = false -is_equality(set::MOI.EqualTo{T}) where T = true - -map = primal_dual_map.primal_con_dual_var; -``` - -**complimentary slackness**: $$\mu_{i}(G\bar x -h)_i=0 \qquad \text{ where } i=1..m$$ - - -```julia -for con_index in keys(map) - # NOTE: OSQP.Optimizer doesn't allows access to MOI.ConstraintPrimal - # That's why I defined a custom map - - set = MOI.get(model, MOI.ConstraintSet(), con_index) - μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1]) - - if !is_equality(set) - # μ[i]*(Gx - h)[i] = 0 - i = constraint_map[con_index] - - # println(μ," - ",G[i,:]'*x̄, " - ",h[i]) - # TODO: assertion fails - @assert μ*(G[i,:]'*x̄ - h[i]) < 1e-1 - end -end -``` - -**primal feasibility**: -$$(G\bar x -h)_i=0 \qquad \text{ 
where } i=1..m$$ -$$(A\bar x -b)_j=0 \qquad \text{ where } j=1..p$$ - -**dual feasibility**: -$$\mu_i \geq 0 \qquad \text{ where } i=1..m$$ - - -```julia -for con_index in keys(map) - # NOTE: OSQP.Optimizer doesn't allows access to MOI.ConstraintPrimal - # That's why I defined a custom map - - set = MOI.get(model, MOI.ConstraintSet(), con_index) - μ = MOI.get(dual_model, MOI.VariablePrimal(), map[con_index][1]) - i = constraint_map[con_index] - - if is_equality(set) - # (Ax - h)[i] = 0 - @assert abs(A[i,:]'*x̄ - b[i]) < 1e-2 - else - # (Gx - h)[i] = 0 - @assert G[i,:]'*x̄ - h[i] < 1e-2 - - # μ[i] >= 0 - # TODO: assertion fails - @assert μ > -1e-2 - end -end -``` diff --git a/docs/src/solve-conic-1.md b/docs/src/solve-conic-1.md index 39c1b1612..59c6d9387 100644 --- a/docs/src/solve-conic-1.md +++ b/docs/src/solve-conic-1.md @@ -54,68 +54,9 @@ X \in \mathbb{S}^n: z^T X z \geq 0, \quad \forall z \in \mathbb{R}^n > Refered from Mosek examples: https://docs.mosek.com/9.2/toolbox/tutorial-sdo-shared.html#example-sdo1 -## Equivalent DiffCP program to differentiate -```python -import numpy as np -import cvxpy as cp -from scipy import sparse -import diffcp - -A = sparse.csc_matrix((11+1,7+1), dtype=np.float64) -A[2 , 1] = 1.0 -A[3 , 1] = -1.0 -A[9 , 1] = -0.45 -A[10, 1] = 0.45 -A[11, 1] = -0.45 -A[2 , 2] = 1.0 -A[4 , 2] = -1.0 -A[9 , 2] = -0.8 -A[10, 2] = 0.318198 -A[11, 2] = -0.1 -A[2 , 3] = 1.0 -A[5 , 3] = -1.0 -A[9 , 3] = -0.9 -A[2 , 4] = 1.0 -A[6 , 4] = -1.0 -A[9 , 4] = -0.225 -A[2 , 5] = 1.0 -A[7 , 5] = -1.0 -A[9 , 5] = -0.1125 -A[10, 5] = 0.1125 -A[11, 5] = -0.1125 -A[2 , 6] = 1.0 -A[8 , 6] = -1.0 -A[11, 6] = -0.225 -A[9 , 7] = 1.0 -A[11, 7] = 1.0 - -A = A[1:, 1:] - -# equivalent to: https://github.com/jump-dev/MathOptInterface.jl/blob/master/src/Test/contconic.jl#L2575 - -cone_dict = { - diffcp.POS: 7, - diffcp.PSD: [2], - diffcp.ZERO: 1 -} - -b = np.array([0.0, 10.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0]) -c = np.array([-0.0, -0.0, -0.0, -0.0, -0.0, 
-0.0, -1.0]) - -x, y, s, D, DT = diffcp.solve_and_derivative(A, b, c, cone_dict) -print(x) # MOI.VariablePrimal -print(s) # MOI.ConstraintPrimal -print(y) # MOI.ConstraintDual - - -dx, dy, ds = D(sparse.csc_matrix(np.ones((11,7))), np.ones(11), np.ones(7)) -print(dx) -print(ds) -print(dy) -``` - ## Equivalent DiffOpt program -```julia +- Instantiate the program with constraints +```@example 1 using SCS using DiffOpt using MathOptInterface @@ -170,6 +111,11 @@ c2 = MOI.add_constraint( ), MOI.Zeros(1) ) +nothing # hide +``` + +- Add objective and solve +```@example 1 objXidx = [1:3; 5:6] objXcoefs = 2*ones(5) @@ -185,20 +131,25 @@ x_sol = MOI.get(model, MOI.VariablePrimal(), vcat(X, x)) s_sol = MOI.get(model, MOI.ConstraintPrimal(), [cX, cx, c1, c2]) y_sol = MOI.get(model, MOI.ConstraintDual(), [cX, cx, c1, c2]) -println("x -> ", round.(x_sol; digits=3)) -println("s -> ", round.(s_sol; digits=3)) -println("y -> ", round.(y_sol; digits=3)) +@show x_sol +@show s_sol +@show y_sol +nothing # hide +``` + +- Now differentiate with perturbation +```@example 1 # perturbations in all the parameters -fx = MOI.SingleVariable.(x) +fx = MOI.SingleVariable.(vcat(X, x)) MOI.set(model, DiffOpt.ForwardInConstraint(), c1, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1))) MOI.set(model, - DiffOpt.ForwardInConstraint(), c2, MOI.Utilities.vectorize(ones(6, 9) * fx + ones(6))) + DiffOpt.ForwardInConstraint(), cX, MOI.Utilities.vectorize(ones(6, 9) * fx + ones(6))) MOI.set(model, - DiffOpt.ForwardInConstraint(), c3, MOI.Utilities.vectorize(ones(3, 9) * fx + ones(3))) + DiffOpt.ForwardInConstraint(), cx, MOI.Utilities.vectorize(ones(3, 9) * fx + ones(3))) MOI.set(model, - DiffOpt.ForwardInConstraint(), c4, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1))) + DiffOpt.ForwardInConstraint(), c2, MOI.Utilities.vectorize(ones(1, 9) * fx + ones(1))) # differentiate and get the gradients DiffOpt.forward(model) @@ -206,7 +157,6 @@ DiffOpt.forward(model) dx = MOI.get.(model, 
DiffOpt.ForwardOutVariablePrimal(), vcat(X, x)) -println("dx -> ", round.(dx; digits=3)) -# println("ds -> ", round.(ds; digits=3)) -# println("dy -> ", round.(dy; digits=3)) +@show dx +nothing # hide ``` diff --git a/docs/src/usage.md b/docs/src/usage.md index 91833656f..07e255698 100644 --- a/docs/src/usage.md +++ b/docs/src/usage.md @@ -2,18 +2,19 @@ Create a differentiable model from [existing optimizers](https://www.juliaopt.org/JuMP.jl/stable/installation/) ```julia - using DiffOpt - using SCS - - model = diff_optimizer(SCS.Optimizer) +using DiffOpt +using SCS +using JuMP + +model = diff_optimizer(SCS.Optimizer) ``` Update and solve the model ```julia - x = MOI.add_variables(model, 2) - c = MOI.add_constraint(model, ...) - - MOI.optimize!(model) +x = MOI.add_variables(model, 2) +c = MOI.add_constraint(model, ...) + +MOI.optimize!(model) ``` Finally differentiate the model (primal and dual variables specifically) to obtain product of jacobians with respect to problem parameters and a backward pass vector. Currently `DiffOpt` supports two backends for differentiating a model: diff --git a/examples/autotuning-ridge.jl b/examples/autotuning-ridge.jl new file mode 100644 index 000000000..ec7112242 --- /dev/null +++ b/examples/autotuning-ridge.jl @@ -0,0 +1,177 @@ +""" + Source code for the example given in autotuning-ridge.md +""" + +using DiffOpt +using Statistics +using OSQP +using JuMP +import Random +using LinearAlgebra +using Printf +Base.show(io::IO, f::Float64) = @printf(io, "%1.3f", f) # to reduce float precision while printing + +# optional Plot to integrate in tests +# set ENV["BUILD_PLOT"] = "1" to build plots +const should_plot = get(ENV, "BUILD_PLOT", nothing) == "1" +if should_plot + using Plots +end + +""" + Return the coefficient of determination R2 of the prediction. 
+ best possible score is 1.0 + it can be negative (because the model can be arbitrarily worse) +""" +function R2(y_true, y_pred) + u = sum((y_pred - y_true).^2) # Regression sum of squares + v = sum((y_true .- mean(y_true)).^2) # Total sum of squares + + return 1-(u/v) +end + +function create_problem(N, D, noise) + w = rand(D) + X = rand(N, D) + + # if noise=0, then there is no need of regularization + # and alpha=0 will give the best R2 pred score + Y = X*w .+ noise*randn(N) + + # test train split + l = Int(N*0.5) + return X[1:l, :], X[l+1:N, :], Y[1:l], Y[l+1:N] +end + +X_train, X_test, Y_train, Y_test = create_problem(800, 30, 4); + +function fitRidge(X,Y,α) + model = Model(() -> diff_optimizer(OSQP.Optimizer)) + + N, D = size(X) + + # add variables + @variable(model, w[1:D]>=-10) + set_optimizer_attribute(model, MOI.Silent(), true) + + @objective( + model, + Min, + sum((Y - X*w).*(Y - X*w))/(2.0*N) + α*(w'w), + ) + + optimize!(model) + + loss = objective_value(model) + return model, w, loss, value.(w) +end + +αs = [0.0, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 7e-2, 2e-1, 3e-1, .5, .7, 1.0] +Rs = Float64[] +mse = Float64[] + +for α in αs + _, _, _, w_train = fitRidge(X_train, Y_train, α) + Y_pred = X_test*w_train + push!(Rs, R2(Y_test, Y_pred)) + push!(mse, sum((Y_pred - Y_test).^2)) +end + +if should_plot + plot(log.(αs), Rs*10, label="R2 prediction score", xaxis = ("log(α)")) + plot(log.(αs), mse, label="MSE", xaxis = ("log(α)")) +end + +function ∇model(model, X_train, w, ŵ, α) + N, D = size(X_train) + dw = zeros(D) + ∂w_∂α = zeros(D) + + for i in 1:D + dw[i] = 1.0 #set + + MOI.set( + model, + DiffOpt.ForwardInObjective(), + MOI.ScalarQuadraticFunction( + [MOI.ScalarAffineTerm(0.0, w[i].index)], + [MOI.ScalarQuadraticTerm(dw[i]*α, w[i].index, w[i].index)], + 0.0 + ) + ) + + DiffOpt.forward(model) # find grad + + ∂w_∂α[i] = MOI.get( + model, + DiffOpt.ForwardOutVariablePrimal(), + w[i] + ) + + dw[i] = 0.0 #unset + end + return sqrt(ŵ'ŵ) + 2α*(ŵ'∂w_∂α) - 
sum((X_train*∂w_∂α).*(Y_train - X_train*ŵ))/(2*N) +end + +∂l_∂αs = Float64[] +N, D = size(X_train) + +for α in αs + model, w, _, ŵ = fitRidge(X_train, Y_train, α) + + # testing optimality + ∂l_∂w = [2*α*ŵ[i] - sum(X_train[:,i].*(Y_train - X_train*ŵ))/N for i in 1:D] + @assert norm(∂l_∂w) < 1e-1 + + push!( + ∂l_∂αs, + ∇model(model, X_train, w, ŵ, α) + ) +end + +if should_plot + plot(αs, ∂l_∂αs, label="∂l/∂α", xaxis = ("α")) +end + +""" + start from initial value of regularization constant + do gradient descent on alpha + until the MSE keeps on decreasing +""" +function descent(α, max_iters=25) + prev_mse = 1e7 + curr_mse = 1e6 + + α_s = Float64[] + mse = Float64[] + + iter=0 + κ = 0.01 + while curr_mse-10 < prev_mse && iter < max_iters + iter += 1 + model, w, _, ŵ = fitRidge(X_train, Y_train, α) + + #update + ∂α = ∇model(model, X_train, w, ŵ, α) + + α += κ*∂α # update step + + push!(α_s, α) + + Y_pred = X_test*ŵ + + prev_mse = curr_mse + curr_mse = sum((Y_pred - Y_test).^2) + + push!(mse, curr_mse) + end + + return α_s, mse +end + +ᾱ, msē = descent(1.0); + +if should_plot + plot(log.(αs), mse, label="MSE", xaxis = ("α")) + plot!(log.(ᾱ), msē, label="G.D. 
for α", lw = 2) +end diff --git a/examples/chainrules.jl b/examples/chainrules.jl index 5ac83747b..3c56cf4d8 100644 --- a/examples/chainrules.jl +++ b/examples/chainrules.jl @@ -3,6 +3,7 @@ using Clp using DiffOpt using Test using ChainRulesCore +using LinearAlgebra # This script creates the JuMP model for a small unit commitment instance # represented in a solution map function taking parameters as arguments and returning diff --git a/examples/custom-relu.jl b/examples/custom-relu.jl new file mode 100644 index 000000000..1421986c5 --- /dev/null +++ b/examples/custom-relu.jl @@ -0,0 +1,106 @@ +""" + Source code for the example given in custom-relu.md +""" + +using Statistics +using DiffOpt +using Flux +using Flux: onehotbatch, onecold, crossentropy, throttle +using Base.Iterators: repeated +using OSQP +using JuMP +using ChainRulesCore + +## prepare data +imgs = Flux.Data.MNIST.images() +labels = Flux.Data.MNIST.labels(); + +# Preprocessing +X = hcat(float.(reshape.(imgs, :))...) #stack all the images +Y = onehotbatch(labels, 0:9); # just a common way to encode categorical variables + +test_X = hcat(float.(reshape.(Flux.Data.MNIST.images(:test), :))...) 
+test_Y = onehotbatch(Flux.Data.MNIST.labels(:test), 0:9) + +# float64 to float16, to save memory +X = convert(Array{Float16,2}, X) +test_X = convert(Array{Float16,2}, test_X) + +X = X[:, 1:1000] +Y = Y[:, 1:1000]; + +""" + relu method for a Matrix +""" +function matrix_relu(y::AbstractMatrix{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T} + x̂ = zeros(T, size(y)) + + # model init + N = length(y[:, 1]) + empty!(model) + set_optimizer_attribute(model, MOI.Silent(), true) + @variable(model, x[1:N] >= zero(T)) + + for i in 1:size(y)[2] + @objective( + model, + Min, + x'x -2x'y[:, i] + ) + optimize!(model) + x̂[:, i] = value.(x) + end + return x̂ +end + +function ChainRulesCore.rrule(::typeof(matrix_relu), y::AbstractArray{T}; model = Model(() -> diff_optimizer(OSQP.Optimizer))) where {T} + + pv = matrix_relu(y, model=model) + + function pullback_matrix_relu(dx) + x = model[:x] + dy = zeros(T, size(dx)) + + for i in 1:size(y)[2] + MOI.set.( + model, + DiffOpt.BackwardInVariablePrimal(), + x, + dx[:, i] + ) + + DiffOpt.backward(model) # find grad + + # fetch the objective expression + obj_exp = MOI.get(model, DiffOpt.BackwardOutObjective()) + + dy[:, i] = JuMP.coefficient.(obj_exp, x) # coeff of `x` in -2x'y + dy[:, i] = -2 * dy[:, i] + end + + return (NO_FIELDS, dy) + end + return pv, pullback_matrix_relu +end + +m = Chain( + Dense(784, 64), + matrix_relu, + Dense(64, 10), + softmax +) + +custom_loss(x, y) = crossentropy(m(x), y) +opt = ADAM(); # popular stochastic gradient descent variant + +accuracy(x, y) = mean(onecold(m(x)) .== onecold(y)) # cute way to find average of correct guesses + +dataset = repeated((X,Y), 20) # repeat the data set, very low accuracy on the orig dataset +evalcb = () -> @show(custom_loss(X, Y)) # callback to show loss + +Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU + +@show accuracy(X,Y) +@show accuracy(test_X, test_Y); + + diff --git 
a/examples/custom-svm.jl b/examples/custom-svm.jl new file mode 100644 index 000000000..ba70748fa --- /dev/null +++ b/examples/custom-svm.jl @@ -0,0 +1,114 @@ +""" + Source code for the example given in custom-svm.md +""" + +using Statistics +using DiffOpt +using Flux +using Flux: onecold, binarycrossentropy, throttle, logitcrossentropy, crossentropy +using Base.Iterators: repeated +using JuMP +using SCS +using CSV +using DataFrames +using ChainRulesCore +using HTTP + +const DATA_URL = "https://raw.githubusercontent.com/be-apt/jump-gsoc-2020/master/titanic_preprocessed.csv" +labels = NaN; # hack for the SVM + +""" + SVM as a Flux layer +""" +function SVM(X::AbstractMatrix{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T} + D, N = size(X) + + Y = vec([y >= 0.5 ? 1 : -1 for y in labels]') + # scale class from 0,1 to -1,1. required for SVM + + # model init + empty!(model) + set_optimizer_attribute(model, MOI.Silent(), true) + + # add variables + @variable(model, l[1:N]) + @variable(model, w[1:D]) + @variable(model, b) + + @constraint(model, cons, Y.*(X'*w .+ b) + l.-1 ∈ MOI.Nonnegatives(N)) + @constraint(model, 1.0*l ∈ MOI.Nonnegatives(N)); + + @objective( + model, + Min, + sum(l), + ) + + optimize!(model) + + wv = value.(w) + bv = value(b) + + return (X'*wv .+ bv)' #prediction +end + +function ChainRulesCore.rrule(::typeof(SVM), X::AbstractArray{T}; model = Model(() -> diff_optimizer(SCS.Optimizer))) where {T} + + predictions = SVM(X, model=model) + + """ + model[:w], model[:b] are the weights of this layer + they are not updated using backward pass + since they can be computed to an accurate degree using a solver + """ + function pullback_SVM(dX) + dy = zeros(T, size(dX)) # since w# + return (NO_FIELDS, dy) + end + return predictions, pullback_SVM +end + +function fetchProblem(;split_ratio::Float64) + df = CSV.File(HTTP.get(DATA_URL).body) |> DataFrame + + Y = df[:, 2] + X = df[!, 3:12] + X = Matrix(X)' + + D, N = size(X) + + l = 
Int(floor(length(Y)*split_ratio)) + return X[:, 1:l], X[:, l+1:N], Y[1:l]', Y[l+1:N]' +end +X_train, X_test, Y_train, Y_test = fetchProblem(split_ratio=0.8) +D = size(X_train)[1]; + +m = Chain( + Dense(D, 16, relu), + Dropout(0.5), + SVM +# Dense(32, 1, σ), +); + +custom_loss(x, y) = logitcrossentropy(m(x), y) +opt = ADAM(); # popular stochastic gradient descent variant + +classify(x::Float64) = (x>=0.5) ? 1 : 0 + +function accuracy(x, y_true) + y_pred = classify.(m(x)) + return sum(y_true .≈ y_pred) / length(y_true) +end + +dataset = repeated((X_train,Y_train), 1) # repeat the data set, very low accuracy on the orig dataset +evalcb = () -> @show(custom_loss(X_train,Y_train)) # callback to show loss + +labels = Y_train # needed for SVM +for iter in 1:1 + Flux.train!(custom_loss, params(m), dataset, opt, cb = throttle(evalcb, 5)); #took me ~5 minutes to train on CPU +end + +@show accuracy(X_train, Y_train) + +labels = Y_test # needed for SVM +@show accuracy(X_test, Y_test) diff --git a/examples/sensitivity-SVM.jl b/examples/sensitivity-SVM.jl index 1c739a5b6..866b17950 100644 --- a/examples/sensitivity-SVM.jl +++ b/examples/sensitivity-SVM.jl @@ -10,6 +10,7 @@ using LinearAlgebra using MathOptInterface const MOI = MathOptInterface; +const MOIU = MOI.Utilities; # optional Plot to integrate in tests # set ENV["SVM_PLOT"] = "1" to build plots @@ -18,11 +19,11 @@ if should_plot using Plots end -N = 50 +N = 100 D = 2 -Random.seed!(rand(1:100)) -X = vcat(randn(N, D), randn(N,D) .+ [4.0,1.5]') -y = append!(ones(N), -ones(N)); +Random.seed!(62) +X = vcat(randn(N÷2, D), randn(N÷2,D) .+ [4.0,1.5]') +y = append!(ones(N÷2), -ones(N÷2)); model = diff_optimizer(SCS.Optimizer) diff --git a/examples/sensitivity-analysis-ridge.jl b/examples/sensitivity-analysis-ridge.jl new file mode 100644 index 000000000..2f72c8931 --- /dev/null +++ b/examples/sensitivity-analysis-ridge.jl @@ -0,0 +1,109 @@ +""" + Source code for the example given in sensitivity-analysis-ridge.md +""" + +import 
Random +import OSQP +using DiffOpt +using JuMP +using LinearAlgebra + +# optional Plot to integrate in tests +# set ENV["BUILD_PLOT"] = "1" to build plots +const should_plot = get(ENV, "BUILD_PLOT", nothing) == "1" +if should_plot + using Plots +end + +function create_problem(N=100) + m = 2*abs(randn()) + b = rand() + X = randn(N) + Y = m*X .+ b + 0.8*randn(N) + + return X, Y +end + +X, Y = create_problem(); + +function fitRidge(X,Y,alpha=0.1) + model = Model(() -> diff_optimizer(OSQP.Optimizer)) + + # add variables + @variable(model, w) + @variable(model, b) + set_optimizer_attribute(model, MOI.Silent(), true) + + @objective( + model, + Min, + sum((Y - w*X .- b).*(Y - w*X .- b)) + alpha*(sum(w*w)+sum(b*b)), + ) + + optimize!(model) + + loss = objective_value(model) + return model, w, b, loss, value(w), value(b) +end + +model, w, b, loss_train, ŵ, b̂ = fitRidge(X, Y) + +# plot the regressing line +if should_plot + p = Plots.scatter(X, Y, label="") + mi, ma = minimum(X), maximum(X) + Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="") +end + +# get the gradients +∇ = zero(X) +for i in 1:length(X) + # MOI.set( + # model, + # DiffOpt.ForwardInObjective(), + # w, + # -2*(Y[i] + X[i]) + # ) + # MOI.set( + # model, + # DiffOpt.ForwardInObjective(), + # w, + # w, + # 2*X[i] + # ) + + MOI.set( + model, + DiffOpt.ForwardInObjective(), + MOI.ScalarQuadraticFunction( + [MOI.ScalarAffineTerm(-2(Y[i] + X[i]), w.index)], + [MOI.ScalarQuadraticTerm(2X[i], w.index, w.index)], + 0.0 + ) + ) + + DiffOpt.forward(model) + + db = MOI.get( + model, + DiffOpt.ForwardOutVariablePrimal(), + b + ) + + ∇[i] = db +end +normalize!(∇); + + +# plot the data-point sensitivities +if should_plot + p = Plots.scatter( + X, Y, + color=[x>0 ? 
:red : :blue for x in ∇], + markersize=[25*abs(x) for x in ∇], + label="" + ) + mi, ma = minimum(X), maximum(X) + Plots.plot!(p, [mi, ma], [mi*ŵ+b̂, ma*ŵ+b̂], color=:red, label="") +end + diff --git a/examples/solve-LP.jl b/examples/solve-LP.jl deleted file mode 100644 index 45c9473f9..000000000 --- a/examples/solve-LP.jl +++ /dev/null @@ -1,73 +0,0 @@ -using Random -using GLPK -using MathOptInterface - -const MOI = MathOptInterface -const MOIU = MathOptInterface.Utilities; - -using Test - -D = 10 # variable dimension -N = 20 # no of inequality constraints - -s = rand(N) -s = 2*s.-1 -λ = max.(-s, 0) -s = max.(s, 0) -x̂ = rand(D) -A = rand(N, D) -b = A*x̂ + s -c = -A' * λ - -# can feed dual problem to optimizer like this: -# model = MOI.instantiate(dual_optimizer(GLPK.Optimizer), with_bridge_type=Float64) - -model = GLPK.Optimizer() -x = MOI.add_variables(model, D) - -# define objective -objective_function = MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(c, x), 0.0) -MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarAffineFunction{Float64}}(), objective_function) -MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE) - -# will be useful later -constraint_indices = [] - -# set constraints -for i in 1:N - push!(constraint_indices, MOI.add_constraint(model,MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(A[i,:], x), 0.),MOI.LessThan(b[i]))) -end - -for i in 1:D - push!(constraint_indices, MOI.add_constraint(model,MOI.SingleVariable(x[i]),MOI.GreaterThan(0.))) -end - -MOI.optimize!(model) - -@assert MOI.get(model, MOI.TerminationStatus()) in [MOI.LOCALLY_SOLVED, MOI.OPTIMAL] - -x̄ = MOI.get(model, MOI.VariablePrimal(), x); # solution - -@assert abs(c'x̄ - c'x̂) <= 1e-8 # sanity check - - -## -## Checking KKT conditions -## - -is_less_than(set::S) where {S<:MOI.AbstractSet} = false -is_less_than(set::MOI.LessThan{T}) where T = true - -for con_index in constraint_indices - con_value = MOI.get(model, MOI.ConstraintPrimal(), con_index) - set = MOI.get(model, MOI.ConstraintSet(), 
con_index) - μ = MOI.get(model, MOI.ConstraintDual(), con_index) - - if is_less_than(set) - # μ[i]*(Ax - b)[i] = 0 - @assert μ*(con_value - set.upper) < 1e-10 - else - # μ[j]*x[j] = 0 - @assert μ*(con_value - set.lower) < 1e-10 - end -end diff --git a/examples/solve-QP.jl b/examples/solve-QP.jl deleted file mode 100644 index 926e21241..000000000 --- a/examples/solve-QP.jl +++ /dev/null @@ -1,86 +0,0 @@ -using Random -using Test - -using MathOptInterface -const MOI = MathOptInterface -const MOIU = MathOptInterface.Utilities; - -using Ipopt - -n = 20 # variable dimension -m = 15 # no of inequality constraints - -x̂ = rand(n) -Q = rand(n, n) -Q = Q' * Q # ensure PSD -q = rand(n) -G = rand(m, n) -h = G * x̂ + rand(m); - -model = MOI.instantiate(Ipopt.Optimizer, with_bridge_type=Float64) -x = MOI.add_variables(model, n); - -# define objective - -quad_terms = MOI.ScalarQuadraticTerm{Float64}[] -for i in 1:n - for j in i:n # indexes (i,j), (j,i) will be mirrored. specify only one kind - push!(quad_terms, MOI.ScalarQuadraticTerm(Q[i,j],x[i],x[j])) - end -end - -objective_function = MOI.ScalarQuadraticFunction(MOI.ScalarAffineTerm.(q, x),quad_terms,0.0) -MOI.set(model, MOI.ObjectiveFunction{MOI.ScalarQuadraticFunction{Float64}}(), objective_function) -MOI.set(model, MOI.ObjectiveSense(), MOI.MIN_SENSE) - -# maintain constrain to index map - will be useful later -constraint_indices = [ - MOI.add_constraint( - model, - MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(G[i,:], x), 0.),MOI.LessThan(h[i]), - ) for i in 1:m -] - - -MOI.optimize!(model) - -@assert MOI.get(model, MOI.TerminationStatus()) in (MOI.LOCALLY_SOLVED, MOI.OPTIMAL) - -x̄ = MOI.get(model, MOI.VariablePrimal(), x) - -# objective value (predicted vs actual) sanity check -@test 0.5*x̄'*Q*x̄ + q'*x̄ <= 0.5*x̂'*Q*x̂ + q'*x̂ - -# NOTE: can't use Ipopt -# Ipopt.Optimizer doesn't supports accessing MOI.ObjectiveFunctionType - -# -# Verifying KKT Conditions -# - -# complimentary slackness + dual feasibility -for i in 
1:size(constraint_indices)[1]
-    con_index = constraint_indices[i]
-    μ_i = MOI.get(model, MOI.ConstraintDual(), con_index)
-
-    # μ[i] * (G * x - h)[i] = 0
-    @test abs(μ_i * (G[i,:]' * x̄ - h[i])) < 3e-2
-
-    # μ[i] <= 0
-    @test μ_i <= 1e-2
-end
-
-
-# checking stationarity
-for j in 1:n
-    G_mu_sum = 0
-
-    for i in 1:size(constraint_indices)[1]
-        con_index = constraint_indices[i]
-        μ_i = MOI.get(model, MOI.ConstraintDual(), con_index)
-
-        G_mu_sum += μ_i * G[i,j]
-    end
-
-    @test abs(G_mu_sum - (Q * x̄ + q)[j]) < 1e-2
-end
diff --git a/src/diff_opt.jl b/src/diff_opt.jl
index 8339144ec..8b74c243b 100644
--- a/src/diff_opt.jl
+++ b/src/diff_opt.jl
@@ -128,7 +128,7 @@ A `MOI.ScalarQuadraticFunction` can only be used in linearly constrained
 quadratic models.
 
 For instance, if the objective contains `θ * (x + 2y)`, for the purpose of
-computinig the derivative with respect to `θ`, the following should be set:
+computing the derivative with respect to `θ`, the following should be set:
 ```julia
 fx = MOI.SingleVariable(x)
 fy = MOI.SingleVariable(y)
diff --git a/test/runtests.jl b/test/runtests.jl
index dfeae4f8c..756c67d5d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -23,11 +23,13 @@ const ATOL = 1e-4
 const RTOL = 1e-4
 
 @testset "Examples" begin
-    include(joinpath(@__DIR__, "../examples/solve-LP.jl"))
-    include(joinpath(@__DIR__, "../examples/solve-QP.jl"))
+    include(joinpath(@__DIR__, "../examples/autotuning-ridge.jl"))
+    include(joinpath(@__DIR__, "../examples/chainrules.jl"))
+    include(joinpath(@__DIR__, "../examples/custom-relu.jl"))
+    include(joinpath(@__DIR__, "../examples/custom-svm.jl"))
     include(joinpath(@__DIR__, "../examples/unit_example.jl"))
+    include(joinpath(@__DIR__, "../examples/sensitivity-analysis-ridge.jl"))
     include(joinpath(@__DIR__, "../examples/sensitivity-SVM.jl"))
-    include(joinpath(@__DIR__, "../examples/chainrules.jl"))
 end
 
 @testset "Generate random problems" begin